xref: /linux/net/core/sock.c (revision 359bcf15ec1d6738ede721db628594ecf05fd998)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *		Steve Whitehouse:	Added default destructor to free
73  *					protocol private data.
74  *		Steve Whitehouse:	Added various other default routines
75  *					common to several socket families.
76  *		Chris Evans	:	Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include <uapi/linux/pidfd.h>
152 
153 #include "dev.h"
154 
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157 
158 static void sock_def_write_space_wfree(struct sock *sk);
159 static void sock_def_write_space(struct sock *sk);
160 
161 /**
162  * sk_ns_capable - General socket capability test
163  * @sk: Socket to use a capability on or through
164  * @user_ns: The user namespace of the capability to use
165  * @cap: The capability to use
166  *
167  * Test to see if the opener of the socket had the capability @cap when
168  * the socket was created and that the current process has the capability
169  * @cap in the user namespace @user_ns.
170  */
171 bool sk_ns_capable(const struct sock *sk,
172 		   struct user_namespace *user_ns, int cap)
173 {
174 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 		ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178 
179 /**
180  * sk_capable - Socket global capability test
181  * @sk: Socket to use a capability on or through
182  * @cap: The global capability to use
183  *
184  * Test to see if the opener of the socket had the capability @cap when
185  * the socket was created and that the current process has the capability
186  * @cap in all user namespaces.
187  */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 	return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193 
194 /**
195  * sk_net_capable - Network namespace socket capability test
196  * @sk: Socket to use a capability on or through
197  * @cap: The capability to use
198  *
199  * Test to see if the opener of the socket had the capability @cap when the
200  * socket was created and that the current process has the capability @cap
201  * over the network namespace the socket is a member of.
202  */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
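
/* Example (illustrative sketch, not part of the build): a protocol's
 * setsockopt() handler could gate a privileged option on the socket
 * opener's capabilities with one of the helpers above; SO_EXAMPLE_PRIV
 * is a hypothetical option name.
 *
 *	if (optname == SO_EXAMPLE_PRIV &&
 *	    !sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */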
208 
209 /*
210  * Each address family might have different locking rules, so we have
211  * one slock key per address family and separate keys for internal and
212  * userspace sockets.
213  */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218 
219 /*
220  * Make lock validator output more readable. (we pre-construct these
221  * strings at build time, so that runtime initialization of socket
222  * locks is fast):
223  */
224 
225 #define _sock_locks(x)						  \
226   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
227   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
228   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
229   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
230   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
231   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
232   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
233   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
234   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
235   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
236   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
237   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
238   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
239   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
240   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
241   x "AF_MCTP"  , \
242   x "AF_MAX"
243 
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 	_sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 	_sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 	_sock_locks("clock-")
252 };
253 
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 	_sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 	_sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 	_sock_locks("elock-")
271 };
272 
273 /*
274  * sk_callback_lock and sk queues locking rules are per-address-family,
275  * so split the lock classes by using a per-AF key:
276  */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282 
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
290 
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements.
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
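
/* Example (illustrative sketch): a block-over-network driver whose socket
 * backs swap could mark the transport socket so that receive-path
 * allocations may dip into the emergency reserves, and undo it when the
 * device no longer backs swap; "conn" is a hypothetical connection struct.
 *
 *	sk_set_memalloc(conn->sock->sk);
 *	...
 *	sk_clear_memalloc(conn->sock->sk);
 */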
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
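
/* Worked example for the conversion above, assuming HZ == 250: a user
 * timeout of { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * 250 + DIV_ROUND_UP(500000, USEC_PER_SEC / 250) = 500 + 125 = 625
 * jiffies, while { 0, 0 } maps to MAX_SCHEDULE_TIMEOUT (wait forever).
 */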
458 
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465 
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 	switch (sk->sk_family) {
469 	case AF_UNSPEC:
470 	case AF_UNIX:
471 		return false;
472 	default:
473 		return true;
474 	}
475 }
476 
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 	if (sk->sk_flags & flags) {
480 		sk->sk_flags &= ~flags;
481 		if (sock_needs_netstamp(sk) &&
482 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 			net_disable_timestamp();
484 	}
485 }
486 
487 
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 	unsigned long flags;
491 	struct sk_buff_head *list = &sk->sk_receive_queue;
492 
493 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 		atomic_inc(&sk->sk_drops);
495 		trace_sock_rcvqueue_full(sk, skb);
496 		return -ENOMEM;
497 	}
498 
499 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 		atomic_inc(&sk->sk_drops);
501 		return -ENOBUFS;
502 	}
503 
504 	skb->dev = NULL;
505 	skb_set_owner_r(skb, sk);
506 
507 	/* We escape from the RCU-protected region; make sure we don't leak
508 	 * a non-refcounted dst.
509 	 */
510 	skb_dst_force(skb);
511 
512 	spin_lock_irqsave(&list->lock, flags);
513 	sock_skb_set_dropcount(sk, skb);
514 	__skb_queue_tail(list, skb);
515 	spin_unlock_irqrestore(&list->lock, flags);
516 
517 	if (!sock_flag(sk, SOCK_DEAD))
518 		sk->sk_data_ready(sk);
519 	return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522 
523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 			      enum skb_drop_reason *reason)
525 {
526 	enum skb_drop_reason drop_reason;
527 	int err;
528 
529 	err = sk_filter(sk, skb);
530 	if (err) {
531 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
532 		goto out;
533 	}
534 	err = __sock_queue_rcv_skb(sk, skb);
535 	switch (err) {
536 	case -ENOMEM:
537 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
538 		break;
539 	case -ENOBUFS:
540 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
541 		break;
542 	default:
543 		drop_reason = SKB_NOT_DROPPED_YET;
544 		break;
545 	}
546 out:
547 	if (reason)
548 		*reason = drop_reason;
549 	return err;
550 }
551 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
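
/* Example (illustrative sketch): a datagram protocol's delivery path can
 * let the helper above run the socket filter and do the accounting, and
 * reuse the reported reason when dropping; example_rcv() is hypothetical.
 *
 *	static int example_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		enum skb_drop_reason reason;
 *
 *		if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0) {
 *			kfree_skb_reason(skb, reason);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */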
552 
553 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
554 		     const int nested, unsigned int trim_cap, bool refcounted)
555 {
556 	int rc = NET_RX_SUCCESS;
557 
558 	if (sk_filter_trim_cap(sk, skb, trim_cap))
559 		goto discard_and_relse;
560 
561 	skb->dev = NULL;
562 
563 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
564 		atomic_inc(&sk->sk_drops);
565 		goto discard_and_relse;
566 	}
567 	if (nested)
568 		bh_lock_sock_nested(sk);
569 	else
570 		bh_lock_sock(sk);
571 	if (!sock_owned_by_user(sk)) {
572 		/*
573 		 * trylock + unlock semantics:
574 		 */
575 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
576 
577 		rc = sk_backlog_rcv(sk, skb);
578 
579 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
580 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
581 		bh_unlock_sock(sk);
582 		atomic_inc(&sk->sk_drops);
583 		goto discard_and_relse;
584 	}
585 
586 	bh_unlock_sock(sk);
587 out:
588 	if (refcounted)
589 		sock_put(sk);
590 	return rc;
591 discard_and_relse:
592 	kfree_skb(skb);
593 	goto out;
594 }
595 EXPORT_SYMBOL(__sk_receive_skb);
596 
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
598 							  u32));
599 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
600 							   u32));
601 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
602 {
603 	struct dst_entry *dst = __sk_dst_get(sk);
604 
605 	if (dst && dst->obsolete &&
606 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
607 			       dst, cookie) == NULL) {
608 		sk_tx_queue_clear(sk);
609 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
610 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
611 		dst_release(dst);
612 		return NULL;
613 	}
614 
615 	return dst;
616 }
617 EXPORT_SYMBOL(__sk_dst_check);
618 
619 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
620 {
621 	struct dst_entry *dst = sk_dst_get(sk);
622 
623 	if (dst && dst->obsolete &&
624 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
625 			       dst, cookie) == NULL) {
626 		sk_dst_reset(sk);
627 		dst_release(dst);
628 		return NULL;
629 	}
630 
631 	return dst;
632 }
633 EXPORT_SYMBOL(sk_dst_check);
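
/* Example (illustrative sketch): an output path would typically try the
 * cached route first and only do a full lookup when the cookie check
 * above invalidated it; example_route_output() is hypothetical.
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst)
 *		dst = example_route_output(sk);
 */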
634 
635 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
636 {
637 	int ret = -ENOPROTOOPT;
638 #ifdef CONFIG_NETDEVICES
639 	struct net *net = sock_net(sk);
640 
641 	/* Sorry... */
642 	ret = -EPERM;
643 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
644 		goto out;
645 
646 	ret = -EINVAL;
647 	if (ifindex < 0)
648 		goto out;
649 
650 	/* Paired with all READ_ONCE() done locklessly. */
651 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
652 
653 	if (sk->sk_prot->rehash)
654 		sk->sk_prot->rehash(sk);
655 	sk_dst_reset(sk);
656 
657 	ret = 0;
658 
659 out:
660 #endif
661 
662 	return ret;
663 }
664 
665 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
666 {
667 	int ret;
668 
669 	if (lock_sk)
670 		lock_sock(sk);
671 	ret = sock_bindtoindex_locked(sk, ifindex);
672 	if (lock_sk)
673 		release_sock(sk);
674 
675 	return ret;
676 }
677 EXPORT_SYMBOL(sock_bindtoindex);
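
/* Example (illustrative sketch): a kernel owner of a socket that is not
 * already holding the socket lock can bind it to a device in one call;
 * an ifindex of 0 removes the binding.
 *
 *	err = sock_bindtoindex(sock->sk, ifindex, true);
 */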
678 
679 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
680 {
681 	int ret = -ENOPROTOOPT;
682 #ifdef CONFIG_NETDEVICES
683 	struct net *net = sock_net(sk);
684 	char devname[IFNAMSIZ];
685 	int index;
686 
687 	ret = -EINVAL;
688 	if (optlen < 0)
689 		goto out;
690 
691 	/* Bind this socket to a particular device like "eth0",
692 	 * as specified in the passed interface name. If the
693 	 * name is "" or the option length is zero the socket
694 	 * is not bound.
695 	 */
696 	if (optlen > IFNAMSIZ - 1)
697 		optlen = IFNAMSIZ - 1;
698 	memset(devname, 0, sizeof(devname));
699 
700 	ret = -EFAULT;
701 	if (copy_from_sockptr(devname, optval, optlen))
702 		goto out;
703 
704 	index = 0;
705 	if (devname[0] != '\0') {
706 		struct net_device *dev;
707 
708 		rcu_read_lock();
709 		dev = dev_get_by_name_rcu(net, devname);
710 		if (dev)
711 			index = dev->ifindex;
712 		rcu_read_unlock();
713 		ret = -ENODEV;
714 		if (!dev)
715 			goto out;
716 	}
717 
718 	sockopt_lock_sock(sk);
719 	ret = sock_bindtoindex_locked(sk, index);
720 	sockopt_release_sock(sk);
721 out:
722 #endif
723 
724 	return ret;
725 }
726 
727 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
728 				sockptr_t optlen, int len)
729 {
730 	int ret = -ENOPROTOOPT;
731 #ifdef CONFIG_NETDEVICES
732 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
733 	struct net *net = sock_net(sk);
734 	char devname[IFNAMSIZ];
735 
736 	if (bound_dev_if == 0) {
737 		len = 0;
738 		goto zero;
739 	}
740 
741 	ret = -EINVAL;
742 	if (len < IFNAMSIZ)
743 		goto out;
744 
745 	ret = netdev_get_name(net, devname, bound_dev_if);
746 	if (ret)
747 		goto out;
748 
749 	len = strlen(devname) + 1;
750 
751 	ret = -EFAULT;
752 	if (copy_to_sockptr(optval, devname, len))
753 		goto out;
754 
755 zero:
756 	ret = -EFAULT;
757 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
758 		goto out;
759 
760 	ret = 0;
761 
762 out:
763 #endif
764 
765 	return ret;
766 }
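
/* Example (illustrative, userspace side): reading the binding back needs
 * a buffer of at least IFNAMSIZ bytes, otherwise the code above returns
 * -EINVAL.
 *
 *	char ifname[IFNAMSIZ];
 *	socklen_t len = sizeof(ifname);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, ifname, &len);
 */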
767 
768 bool sk_mc_loop(const struct sock *sk)
769 {
770 	if (dev_recursion_level())
771 		return false;
772 	if (!sk)
773 		return true;
774 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
775 	switch (READ_ONCE(sk->sk_family)) {
776 	case AF_INET:
777 		return inet_test_bit(MC_LOOP, sk);
778 #if IS_ENABLED(CONFIG_IPV6)
779 	case AF_INET6:
780 		return inet6_test_bit(MC6_LOOP, sk);
781 #endif
782 	}
783 	WARN_ON_ONCE(1);
784 	return true;
785 }
786 EXPORT_SYMBOL(sk_mc_loop);
787 
788 void sock_set_reuseaddr(struct sock *sk)
789 {
790 	lock_sock(sk);
791 	sk->sk_reuse = SK_CAN_REUSE;
792 	release_sock(sk);
793 }
794 EXPORT_SYMBOL(sock_set_reuseaddr);
795 
796 void sock_set_reuseport(struct sock *sk)
797 {
798 	lock_sock(sk);
799 	sk->sk_reuseport = true;
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_set_reuseport);
803 
804 void sock_no_linger(struct sock *sk)
805 {
806 	lock_sock(sk);
807 	WRITE_ONCE(sk->sk_lingertime, 0);
808 	sock_set_flag(sk, SOCK_LINGER);
809 	release_sock(sk);
810 }
811 EXPORT_SYMBOL(sock_no_linger);
812 
813 void sock_set_priority(struct sock *sk, u32 priority)
814 {
815 	WRITE_ONCE(sk->sk_priority, priority);
816 }
817 EXPORT_SYMBOL(sock_set_priority);
818 
819 void sock_set_sndtimeo(struct sock *sk, s64 secs)
820 {
821 	lock_sock(sk);
822 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
823 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
824 	else
825 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
826 	release_sock(sk);
827 }
828 EXPORT_SYMBOL(sock_set_sndtimeo);
829 
830 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
831 {
832 	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
833 	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
834 	if (val)  {
835 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
836 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
837 	}
838 }
839 
840 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
841 {
842 	switch (optname) {
843 	case SO_TIMESTAMP_OLD:
844 		__sock_set_timestamps(sk, valbool, false, false);
845 		break;
846 	case SO_TIMESTAMP_NEW:
847 		__sock_set_timestamps(sk, valbool, true, false);
848 		break;
849 	case SO_TIMESTAMPNS_OLD:
850 		__sock_set_timestamps(sk, valbool, false, true);
851 		break;
852 	case SO_TIMESTAMPNS_NEW:
853 		__sock_set_timestamps(sk, valbool, true, true);
854 		break;
855 	}
856 }
857 
858 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
859 {
860 	struct net *net = sock_net(sk);
861 	struct net_device *dev = NULL;
862 	bool match = false;
863 	int *vclock_index;
864 	int i, num;
865 
866 	if (sk->sk_bound_dev_if)
867 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
868 
869 	if (!dev) {
870 		pr_err("%s: socket not bound to a device\n", __func__);
871 		return -EOPNOTSUPP;
872 	}
873 
874 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
875 	dev_put(dev);
876 
877 	for (i = 0; i < num; i++) {
878 		if (*(vclock_index + i) == phc_index) {
879 			match = true;
880 			break;
881 		}
882 	}
883 
884 	if (num > 0)
885 		kfree(vclock_index);
886 
887 	if (!match)
888 		return -EINVAL;
889 
890 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
891 
892 	return 0;
893 }
894 
895 int sock_set_timestamping(struct sock *sk, int optname,
896 			  struct so_timestamping timestamping)
897 {
898 	int val = timestamping.flags;
899 	int ret;
900 
901 	if (val & ~SOF_TIMESTAMPING_MASK)
902 		return -EINVAL;
903 
904 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
905 	    !(val & SOF_TIMESTAMPING_OPT_ID))
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID &&
909 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
910 		if (sk_is_tcp(sk)) {
911 			if ((1 << sk->sk_state) &
912 			    (TCPF_CLOSE | TCPF_LISTEN))
913 				return -EINVAL;
914 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
915 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
916 			else
917 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
918 		} else {
919 			atomic_set(&sk->sk_tskey, 0);
920 		}
921 	}
922 
923 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
924 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
925 		return -EINVAL;
926 
927 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
928 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
929 		if (ret)
930 			return ret;
931 	}
932 
933 	WRITE_ONCE(sk->sk_tsflags, val);
934 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
935 	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
936 
937 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
938 		sock_enable_timestamp(sk,
939 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
940 	else
941 		sock_disable_timestamp(sk,
942 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
943 	return 0;
944 }
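
/* Example (illustrative, userspace side): requesting hardware TX/RX
 * timestamps bound to PHC vclock index 1 on a socket already bound to a
 * device; the vclock index is an assumption for the example.
 *
 *	struct so_timestamping cfg = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &cfg, sizeof(cfg));
 */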
945 
946 #if defined(CONFIG_CGROUP_BPF)
947 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
948 {
949 	struct bpf_sock_ops_kern sock_ops;
950 
951 	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
952 	sock_ops.op = op;
953 	sock_ops.is_fullsock = 1;
954 	sock_ops.sk = sk;
955 	bpf_skops_init_skb(&sock_ops, skb, 0);
956 	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
957 }
958 #endif
959 
960 void sock_set_keepalive(struct sock *sk)
961 {
962 	lock_sock(sk);
963 	if (sk->sk_prot->keepalive)
964 		sk->sk_prot->keepalive(sk, true);
965 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
966 	release_sock(sk);
967 }
968 EXPORT_SYMBOL(sock_set_keepalive);
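
/* Example (illustrative sketch): an in-kernel TCP client typically applies
 * a few of these lock-taking helpers while setting up its transport
 * socket; the exact set and the 5 second send timeout are only examples.
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_keepalive(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 5);
 */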
969 
970 static void __sock_set_rcvbuf(struct sock *sk, int val)
971 {
972 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
973 	 * as a negative value.
974 	 */
975 	val = min_t(int, val, INT_MAX / 2);
976 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
977 
978 	/* We double it on the way in to account for "struct sk_buff" etc.
979 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
980 	 * will allow that much actual data to be received on that socket.
981 	 *
982 	 * Applications are unaware that "struct sk_buff" and other overheads
983 	 * allocate from the receive buffer during socket buffer allocation.
984 	 *
985 	 * And after considering the possible alternatives, returning the value
986 	 * we actually used in getsockopt is the most desirable behavior.
987 	 */
988 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
989 }
990 
991 void sock_set_rcvbuf(struct sock *sk, int val)
992 {
993 	lock_sock(sk);
994 	__sock_set_rcvbuf(sk, val);
995 	release_sock(sk);
996 }
997 EXPORT_SYMBOL(sock_set_rcvbuf);
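
/* Worked example: setsockopt(SO_RCVBUF) with a value of 65536 ends up as
 * sk_rcvbuf = max(2 * 65536, SOCK_MIN_RCVBUF) = 131072, and that doubled
 * value is what a later getsockopt(SO_RCVBUF) reports.
 */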
998 
999 static void __sock_set_mark(struct sock *sk, u32 val)
1000 {
1001 	if (val != sk->sk_mark) {
1002 		WRITE_ONCE(sk->sk_mark, val);
1003 		sk_dst_reset(sk);
1004 	}
1005 }
1006 
1007 void sock_set_mark(struct sock *sk, u32 val)
1008 {
1009 	lock_sock(sk);
1010 	__sock_set_mark(sk, val);
1011 	release_sock(sk);
1012 }
1013 EXPORT_SYMBOL(sock_set_mark);
1014 
1015 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1016 {
1017 	/* Round down bytes to multiple of pages */
1018 	bytes = round_down(bytes, PAGE_SIZE);
1019 
1020 	WARN_ON(bytes > sk->sk_reserved_mem);
1021 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1022 	sk_mem_reclaim(sk);
1023 }
1024 
1025 static int sock_reserve_memory(struct sock *sk, int bytes)
1026 {
1027 	long allocated;
1028 	bool charged;
1029 	int pages;
1030 
1031 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1032 		return -EOPNOTSUPP;
1033 
1034 	if (!bytes)
1035 		return 0;
1036 
1037 	pages = sk_mem_pages(bytes);
1038 
1039 	/* pre-charge to memcg */
1040 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1041 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1042 	if (!charged)
1043 		return -ENOMEM;
1044 
1045 	/* pre-charge to forward_alloc */
1046 	sk_memory_allocated_add(sk, pages);
1047 	allocated = sk_memory_allocated(sk);
1048 	/* If the system goes into memory pressure with this
1049 	 * precharge, give up and return an error.
1050 	 */
1051 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1052 		sk_memory_allocated_sub(sk, pages);
1053 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1054 		return -ENOMEM;
1055 	}
1056 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1057 
1058 	WRITE_ONCE(sk->sk_reserved_mem,
1059 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1060 
1061 	return 0;
1062 }
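
/* Example (illustrative, userspace side): pre-charging roughly 1 MiB for a
 * socket that belongs to a memory cgroup (the option fails with
 * -EOPNOTSUPP otherwise); the amount is accounted in whole pages.
 *
 *	int bytes = 1 << 20;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RESERVE_MEM, &bytes, sizeof(bytes));
 */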
1063 
1064 #ifdef CONFIG_PAGE_POOL
1065 
1066 /* This is the maximum number of tokens and frags that the user can pass
1067  * to SO_DEVMEM_DONTNEED in one syscall. The limit bounds the amount of
1068  * memory the kernel allocates to copy these tokens and prevents looping
1069  * over the frags for too long.
1070  */
1071 #define MAX_DONTNEED_TOKENS 128
1072 #define MAX_DONTNEED_FRAGS 1024
1073 
1074 static noinline_for_stack int
1075 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1076 {
1077 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1078 	struct dmabuf_token *tokens;
1079 	int ret = 0, num_frags = 0;
1080 	netmem_ref netmems[16];
1081 
1082 	if (!sk_is_tcp(sk))
1083 		return -EBADF;
1084 
1085 	if (optlen % sizeof(*tokens) ||
1086 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1087 		return -EINVAL;
1088 
1089 	num_tokens = optlen / sizeof(*tokens);
1090 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1091 	if (!tokens)
1092 		return -ENOMEM;
1093 
1094 	if (copy_from_sockptr(tokens, optval, optlen)) {
1095 		kvfree(tokens);
1096 		return -EFAULT;
1097 	}
1098 
1099 	xa_lock_bh(&sk->sk_user_frags);
1100 	for (i = 0; i < num_tokens; i++) {
1101 		for (j = 0; j < tokens[i].token_count; j++) {
1102 			if (++num_frags > MAX_DONTNEED_FRAGS)
1103 				goto frag_limit_reached;
1104 
1105 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1106 				&sk->sk_user_frags, tokens[i].token_start + j);
1107 
1108 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1109 				continue;
1110 
1111 			netmems[netmem_num++] = netmem;
1112 			if (netmem_num == ARRAY_SIZE(netmems)) {
1113 				xa_unlock_bh(&sk->sk_user_frags);
1114 				for (k = 0; k < netmem_num; k++)
1115 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1116 				netmem_num = 0;
1117 				xa_lock_bh(&sk->sk_user_frags);
1118 			}
1119 			ret++;
1120 		}
1121 	}
1122 
1123 frag_limit_reached:
1124 	xa_unlock_bh(&sk->sk_user_frags);
1125 	for (k = 0; k < netmem_num; k++)
1126 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1127 
1128 	kvfree(tokens);
1129 	return ret;
1130 }
1131 #endif
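
/* Example (illustrative, userspace side): a devmem TCP receiver returns
 * fragment tokens it has finished with, e.g. a token taken from the
 * dmabuf cmsg of a previous recvmsg(); frag_token is an assumption here.
 *
 *	struct dmabuf_token tok = {
 *		.token_start = frag_token,
 *		.token_count = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 */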
1132 
1133 void sockopt_lock_sock(struct sock *sk)
1134 {
1135 	/* When current->bpf_ctx is set, setsockopt() is being called from
1136 	 * a BPF program, and BPF has already ensured the sk lock is
1137 	 * held before calling setsockopt().
1138 	 */
1139 	if (has_current_bpf_ctx())
1140 		return;
1141 
1142 	lock_sock(sk);
1143 }
1144 EXPORT_SYMBOL(sockopt_lock_sock);
1145 
1146 void sockopt_release_sock(struct sock *sk)
1147 {
1148 	if (has_current_bpf_ctx())
1149 		return;
1150 
1151 	release_sock(sk);
1152 }
1153 EXPORT_SYMBOL(sockopt_release_sock);
1154 
1155 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1156 {
1157 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1158 }
1159 EXPORT_SYMBOL(sockopt_ns_capable);
1160 
1161 bool sockopt_capable(int cap)
1162 {
1163 	return has_current_bpf_ctx() || capable(cap);
1164 }
1165 EXPORT_SYMBOL(sockopt_capable);
1166 
1167 static int sockopt_validate_clockid(__kernel_clockid_t value)
1168 {
1169 	switch (value) {
1170 	case CLOCK_REALTIME:
1171 	case CLOCK_MONOTONIC:
1172 	case CLOCK_TAI:
1173 		return 0;
1174 	}
1175 	return -EINVAL;
1176 }
1177 
1178 /*
1179  *	This is meant for all protocols to use and covers goings on
1180  *	at the socket level. Everything here is generic.
1181  */
1182 
1183 int sk_setsockopt(struct sock *sk, int level, int optname,
1184 		  sockptr_t optval, unsigned int optlen)
1185 {
1186 	struct so_timestamping timestamping;
1187 	struct socket *sock = sk->sk_socket;
1188 	struct sock_txtime sk_txtime;
1189 	int val;
1190 	int valbool;
1191 	struct linger ling;
1192 	int ret = 0;
1193 
1194 	/*
1195 	 *	Options without arguments
1196 	 */
1197 
1198 	if (optname == SO_BINDTODEVICE)
1199 		return sock_setbindtodevice(sk, optval, optlen);
1200 
1201 	if (optlen < sizeof(int))
1202 		return -EINVAL;
1203 
1204 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1205 		return -EFAULT;
1206 
1207 	valbool = val ? 1 : 0;
1208 
1209 	/* handle options which do not require locking the socket. */
1210 	switch (optname) {
1211 	case SO_PRIORITY:
1212 		if (sk_set_prio_allowed(sk, val)) {
1213 			sock_set_priority(sk, val);
1214 			return 0;
1215 		}
1216 		return -EPERM;
1217 	case SO_TYPE:
1218 	case SO_PROTOCOL:
1219 	case SO_DOMAIN:
1220 	case SO_ERROR:
1221 		return -ENOPROTOOPT;
1222 #ifdef CONFIG_NET_RX_BUSY_POLL
1223 	case SO_BUSY_POLL:
1224 		if (val < 0)
1225 			return -EINVAL;
1226 		WRITE_ONCE(sk->sk_ll_usec, val);
1227 		return 0;
1228 	case SO_PREFER_BUSY_POLL:
1229 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1230 			return -EPERM;
1231 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1232 		return 0;
1233 	case SO_BUSY_POLL_BUDGET:
1234 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1235 		    !sockopt_capable(CAP_NET_ADMIN))
1236 			return -EPERM;
1237 		if (val < 0 || val > U16_MAX)
1238 			return -EINVAL;
1239 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1240 		return 0;
1241 #endif
1242 	case SO_MAX_PACING_RATE:
1243 		{
1244 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1245 		unsigned long pacing_rate;
1246 
1247 		if (sizeof(ulval) != sizeof(val) &&
1248 		    optlen >= sizeof(ulval) &&
1249 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1250 			return -EFAULT;
1251 		}
1252 		if (ulval != ~0UL)
1253 			cmpxchg(&sk->sk_pacing_status,
1254 				SK_PACING_NONE,
1255 				SK_PACING_NEEDED);
1256 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1257 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1258 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1259 		if (ulval < pacing_rate)
1260 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1261 		return 0;
1262 		}
1263 	case SO_TXREHASH:
1264 		if (!sk_is_tcp(sk))
1265 			return -EOPNOTSUPP;
1266 		if (val < -1 || val > 1)
1267 			return -EINVAL;
1268 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1269 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1270 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1271 		 * and sk_getsockopt().
1272 		 */
1273 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1274 		return 0;
1275 	case SO_PEEK_OFF:
1276 		{
1277 		int (*set_peek_off)(struct sock *sk, int val);
1278 
1279 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1280 		if (set_peek_off)
1281 			ret = set_peek_off(sk, val);
1282 		else
1283 			ret = -EOPNOTSUPP;
1284 		return ret;
1285 		}
1286 #ifdef CONFIG_PAGE_POOL
1287 	case SO_DEVMEM_DONTNEED:
1288 		return sock_devmem_dontneed(sk, optval, optlen);
1289 #endif
1290 	}
1291 
1292 	sockopt_lock_sock(sk);
1293 
1294 	switch (optname) {
1295 	case SO_DEBUG:
1296 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1297 			ret = -EACCES;
1298 		else
1299 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1300 		break;
1301 	case SO_REUSEADDR:
1302 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1303 		break;
1304 	case SO_REUSEPORT:
1305 		if (valbool && !sk_is_inet(sk))
1306 			ret = -EOPNOTSUPP;
1307 		else
1308 			sk->sk_reuseport = valbool;
1309 		break;
1310 	case SO_DONTROUTE:
1311 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1312 		sk_dst_reset(sk);
1313 		break;
1314 	case SO_BROADCAST:
1315 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1316 		break;
1317 	case SO_SNDBUF:
1318 		/* Don't error on this; BSD doesn't, and if you think
1319 		 * about it this is right. Otherwise apps have to
1320 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1321 		 * are treated in BSD as hints.
1322 		 */
1323 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1324 set_sndbuf:
1325 		/* Ensure val * 2 fits into an int, to prevent max_t()
1326 		 * from treating it as a negative value.
1327 		 */
1328 		val = min_t(int, val, INT_MAX / 2);
1329 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1330 		WRITE_ONCE(sk->sk_sndbuf,
1331 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1332 		/* Wake up sending tasks if we upped the value. */
1333 		sk->sk_write_space(sk);
1334 		break;
1335 
1336 	case SO_SNDBUFFORCE:
1337 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1338 			ret = -EPERM;
1339 			break;
1340 		}
1341 
1342 		/* No negative values (to prevent underflow, as val will be
1343 		 * multiplied by 2).
1344 		 */
1345 		if (val < 0)
1346 			val = 0;
1347 		goto set_sndbuf;
1348 
1349 	case SO_RCVBUF:
1350 		/* Don't error on this; BSD doesn't, and if you think
1351 		 * about it this is right. Otherwise apps have to
1352 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1353 		 * are treated in BSD as hints.
1354 		 */
1355 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1356 		break;
1357 
1358 	case SO_RCVBUFFORCE:
1359 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1360 			ret = -EPERM;
1361 			break;
1362 		}
1363 
1364 		/* No negative values (to prevent underflow, as val will be
1365 		 * multiplied by 2).
1366 		 */
1367 		__sock_set_rcvbuf(sk, max(val, 0));
1368 		break;
1369 
1370 	case SO_KEEPALIVE:
1371 		if (sk->sk_prot->keepalive)
1372 			sk->sk_prot->keepalive(sk, valbool);
1373 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1374 		break;
1375 
1376 	case SO_OOBINLINE:
1377 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1378 		break;
1379 
1380 	case SO_NO_CHECK:
1381 		sk->sk_no_check_tx = valbool;
1382 		break;
1383 
1384 	case SO_LINGER:
1385 		if (optlen < sizeof(ling)) {
1386 			ret = -EINVAL;	/* 1003.1g */
1387 			break;
1388 		}
1389 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1390 			ret = -EFAULT;
1391 			break;
1392 		}
1393 		if (!ling.l_onoff) {
1394 			sock_reset_flag(sk, SOCK_LINGER);
1395 		} else {
1396 			unsigned long t_sec = ling.l_linger;
1397 
1398 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1399 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1400 			else
1401 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1402 			sock_set_flag(sk, SOCK_LINGER);
1403 		}
1404 		break;
1405 
1406 	case SO_BSDCOMPAT:
1407 		break;
1408 
1409 	case SO_TIMESTAMP_OLD:
1410 	case SO_TIMESTAMP_NEW:
1411 	case SO_TIMESTAMPNS_OLD:
1412 	case SO_TIMESTAMPNS_NEW:
1413 		sock_set_timestamp(sk, optname, valbool);
1414 		break;
1415 
1416 	case SO_TIMESTAMPING_NEW:
1417 	case SO_TIMESTAMPING_OLD:
1418 		if (optlen == sizeof(timestamping)) {
1419 			if (copy_from_sockptr(&timestamping, optval,
1420 					      sizeof(timestamping))) {
1421 				ret = -EFAULT;
1422 				break;
1423 			}
1424 		} else {
1425 			memset(&timestamping, 0, sizeof(timestamping));
1426 			timestamping.flags = val;
1427 		}
1428 		ret = sock_set_timestamping(sk, optname, timestamping);
1429 		break;
1430 
1431 	case SO_RCVLOWAT:
1432 		{
1433 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1434 
1435 		if (val < 0)
1436 			val = INT_MAX;
1437 		if (sock)
1438 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1439 		if (set_rcvlowat)
1440 			ret = set_rcvlowat(sk, val);
1441 		else
1442 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1443 		break;
1444 		}
1445 	case SO_RCVTIMEO_OLD:
1446 	case SO_RCVTIMEO_NEW:
1447 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1448 				       optlen, optname == SO_RCVTIMEO_OLD);
1449 		break;
1450 
1451 	case SO_SNDTIMEO_OLD:
1452 	case SO_SNDTIMEO_NEW:
1453 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1454 				       optlen, optname == SO_SNDTIMEO_OLD);
1455 		break;
1456 
1457 	case SO_ATTACH_FILTER: {
1458 		struct sock_fprog fprog;
1459 
1460 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1461 		if (!ret)
1462 			ret = sk_attach_filter(&fprog, sk);
1463 		break;
1464 	}
1465 	case SO_ATTACH_BPF:
1466 		ret = -EINVAL;
1467 		if (optlen == sizeof(u32)) {
1468 			u32 ufd;
1469 
1470 			ret = -EFAULT;
1471 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1472 				break;
1473 
1474 			ret = sk_attach_bpf(ufd, sk);
1475 		}
1476 		break;
1477 
1478 	case SO_ATTACH_REUSEPORT_CBPF: {
1479 		struct sock_fprog fprog;
1480 
1481 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1482 		if (!ret)
1483 			ret = sk_reuseport_attach_filter(&fprog, sk);
1484 		break;
1485 	}
1486 	case SO_ATTACH_REUSEPORT_EBPF:
1487 		ret = -EINVAL;
1488 		if (optlen == sizeof(u32)) {
1489 			u32 ufd;
1490 
1491 			ret = -EFAULT;
1492 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1493 				break;
1494 
1495 			ret = sk_reuseport_attach_bpf(ufd, sk);
1496 		}
1497 		break;
1498 
1499 	case SO_DETACH_REUSEPORT_BPF:
1500 		ret = reuseport_detach_prog(sk);
1501 		break;
1502 
1503 	case SO_DETACH_FILTER:
1504 		ret = sk_detach_filter(sk);
1505 		break;
1506 
1507 	case SO_LOCK_FILTER:
1508 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1509 			ret = -EPERM;
1510 		else
1511 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1512 		break;
1513 
1514 	case SO_MARK:
1515 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1516 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1517 			ret = -EPERM;
1518 			break;
1519 		}
1520 
1521 		__sock_set_mark(sk, val);
1522 		break;
1523 	case SO_RCVMARK:
1524 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1525 		break;
1526 
1527 	case SO_RCVPRIORITY:
1528 		sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1529 		break;
1530 
1531 	case SO_RXQ_OVFL:
1532 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1533 		break;
1534 
1535 	case SO_WIFI_STATUS:
1536 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1537 		break;
1538 
1539 	case SO_NOFCS:
1540 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1541 		break;
1542 
1543 	case SO_SELECT_ERR_QUEUE:
1544 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1545 		break;
1546 
1547 	case SO_PASSCRED:
1548 		if (sk_may_scm_recv(sk))
1549 			sk->sk_scm_credentials = valbool;
1550 		else
1551 			ret = -EOPNOTSUPP;
1552 		break;
1553 
1554 	case SO_PASSSEC:
1555 		if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1556 			sk->sk_scm_security = valbool;
1557 		else
1558 			ret = -EOPNOTSUPP;
1559 		break;
1560 
1561 	case SO_PASSPIDFD:
1562 		if (sk_is_unix(sk))
1563 			sk->sk_scm_pidfd = valbool;
1564 		else
1565 			ret = -EOPNOTSUPP;
1566 		break;
1567 
1568 	case SO_PASSRIGHTS:
1569 		if (sk_is_unix(sk))
1570 			sk->sk_scm_rights = valbool;
1571 		else
1572 			ret = -EOPNOTSUPP;
1573 		break;
1574 
1575 	case SO_INCOMING_CPU:
1576 		reuseport_update_incoming_cpu(sk, val);
1577 		break;
1578 
1579 	case SO_CNX_ADVICE:
1580 		if (val == 1)
1581 			dst_negative_advice(sk);
1582 		break;
1583 
1584 	case SO_ZEROCOPY:
1585 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1586 			if (!(sk_is_tcp(sk) ||
1587 			      (sk->sk_type == SOCK_DGRAM &&
1588 			       sk->sk_protocol == IPPROTO_UDP)))
1589 				ret = -EOPNOTSUPP;
1590 		} else if (sk->sk_family != PF_RDS) {
1591 			ret = -EOPNOTSUPP;
1592 		}
1593 		if (!ret) {
1594 			if (val < 0 || val > 1)
1595 				ret = -EINVAL;
1596 			else
1597 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1598 		}
1599 		break;
1600 
1601 	case SO_TXTIME:
1602 		if (optlen != sizeof(struct sock_txtime)) {
1603 			ret = -EINVAL;
1604 			break;
1605 		} else if (copy_from_sockptr(&sk_txtime, optval,
1606 			   sizeof(struct sock_txtime))) {
1607 			ret = -EFAULT;
1608 			break;
1609 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1610 			ret = -EINVAL;
1611 			break;
1612 		}
1613 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1614 		 * scheduler has enough safeguards.
1615 		 */
1616 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1617 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1618 			ret = -EPERM;
1619 			break;
1620 		}
1621 
1622 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1623 		if (ret)
1624 			break;
1625 
1626 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1627 		sk->sk_clockid = sk_txtime.clockid;
1628 		sk->sk_txtime_deadline_mode =
1629 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1630 		sk->sk_txtime_report_errors =
1631 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1632 		break;
1633 
1634 	case SO_BINDTOIFINDEX:
1635 		ret = sock_bindtoindex_locked(sk, val);
1636 		break;
1637 
1638 	case SO_BUF_LOCK:
1639 		if (val & ~SOCK_BUF_LOCK_MASK) {
1640 			ret = -EINVAL;
1641 			break;
1642 		}
1643 		sk->sk_userlocks = val | (sk->sk_userlocks &
1644 					  ~SOCK_BUF_LOCK_MASK);
1645 		break;
1646 
1647 	case SO_RESERVE_MEM:
1648 	{
1649 		int delta;
1650 
1651 		if (val < 0) {
1652 			ret = -EINVAL;
1653 			break;
1654 		}
1655 
1656 		delta = val - sk->sk_reserved_mem;
1657 		if (delta < 0)
1658 			sock_release_reserved_memory(sk, -delta);
1659 		else
1660 			ret = sock_reserve_memory(sk, delta);
1661 		break;
1662 	}
1663 
1664 	default:
1665 		ret = -ENOPROTOOPT;
1666 		break;
1667 	}
1668 	sockopt_release_sock(sk);
1669 	return ret;
1670 }
1671 
1672 int sock_setsockopt(struct socket *sock, int level, int optname,
1673 		    sockptr_t optval, unsigned int optlen)
1674 {
1675 	return sk_setsockopt(sock->sk, level, optname,
1676 			     optval, optlen);
1677 }
1678 EXPORT_SYMBOL(sock_setsockopt);
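
/* Example (illustrative sketch): a kernel caller without a dedicated
 * sock_set_*() helper can still set a socket-level option by wrapping the
 * value in a kernel sockptr.
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_RXQ_OVFL,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 */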
1679 
1680 static const struct cred *sk_get_peer_cred(struct sock *sk)
1681 {
1682 	const struct cred *cred;
1683 
1684 	spin_lock(&sk->sk_peer_lock);
1685 	cred = get_cred(sk->sk_peer_cred);
1686 	spin_unlock(&sk->sk_peer_lock);
1687 
1688 	return cred;
1689 }
1690 
1691 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1692 			  struct ucred *ucred)
1693 {
1694 	ucred->pid = pid_vnr(pid);
1695 	ucred->uid = ucred->gid = -1;
1696 	if (cred) {
1697 		struct user_namespace *current_ns = current_user_ns();
1698 
1699 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1700 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1701 	}
1702 }
1703 
1704 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1705 {
1706 	struct user_namespace *user_ns = current_user_ns();
1707 	int i;
1708 
1709 	for (i = 0; i < src->ngroups; i++) {
1710 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1711 
1712 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1713 			return -EFAULT;
1714 	}
1715 
1716 	return 0;
1717 }
1718 
1719 int sk_getsockopt(struct sock *sk, int level, int optname,
1720 		  sockptr_t optval, sockptr_t optlen)
1721 {
1722 	struct socket *sock = sk->sk_socket;
1723 
1724 	union {
1725 		int val;
1726 		u64 val64;
1727 		unsigned long ulval;
1728 		struct linger ling;
1729 		struct old_timeval32 tm32;
1730 		struct __kernel_old_timeval tm;
1731 		struct  __kernel_sock_timeval stm;
1732 		struct sock_txtime txtime;
1733 		struct so_timestamping timestamping;
1734 	} v;
1735 
1736 	int lv = sizeof(int);
1737 	int len;
1738 
1739 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1740 		return -EFAULT;
1741 	if (len < 0)
1742 		return -EINVAL;
1743 
1744 	memset(&v, 0, sizeof(v));
1745 
1746 	switch (optname) {
1747 	case SO_DEBUG:
1748 		v.val = sock_flag(sk, SOCK_DBG);
1749 		break;
1750 
1751 	case SO_DONTROUTE:
1752 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1753 		break;
1754 
1755 	case SO_BROADCAST:
1756 		v.val = sock_flag(sk, SOCK_BROADCAST);
1757 		break;
1758 
1759 	case SO_SNDBUF:
1760 		v.val = READ_ONCE(sk->sk_sndbuf);
1761 		break;
1762 
1763 	case SO_RCVBUF:
1764 		v.val = READ_ONCE(sk->sk_rcvbuf);
1765 		break;
1766 
1767 	case SO_REUSEADDR:
1768 		v.val = sk->sk_reuse;
1769 		break;
1770 
1771 	case SO_REUSEPORT:
1772 		v.val = sk->sk_reuseport;
1773 		break;
1774 
1775 	case SO_KEEPALIVE:
1776 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1777 		break;
1778 
1779 	case SO_TYPE:
1780 		v.val = sk->sk_type;
1781 		break;
1782 
1783 	case SO_PROTOCOL:
1784 		v.val = sk->sk_protocol;
1785 		break;
1786 
1787 	case SO_DOMAIN:
1788 		v.val = sk->sk_family;
1789 		break;
1790 
1791 	case SO_ERROR:
1792 		v.val = -sock_error(sk);
1793 		if (v.val == 0)
1794 			v.val = xchg(&sk->sk_err_soft, 0);
1795 		break;
1796 
1797 	case SO_OOBINLINE:
1798 		v.val = sock_flag(sk, SOCK_URGINLINE);
1799 		break;
1800 
1801 	case SO_NO_CHECK:
1802 		v.val = sk->sk_no_check_tx;
1803 		break;
1804 
1805 	case SO_PRIORITY:
1806 		v.val = READ_ONCE(sk->sk_priority);
1807 		break;
1808 
1809 	case SO_LINGER:
1810 		lv		= sizeof(v.ling);
1811 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1812 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1813 		break;
1814 
1815 	case SO_BSDCOMPAT:
1816 		break;
1817 
1818 	case SO_TIMESTAMP_OLD:
1819 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1820 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1821 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1822 		break;
1823 
1824 	case SO_TIMESTAMPNS_OLD:
1825 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1826 		break;
1827 
1828 	case SO_TIMESTAMP_NEW:
1829 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1830 		break;
1831 
1832 	case SO_TIMESTAMPNS_NEW:
1833 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1834 		break;
1835 
1836 	case SO_TIMESTAMPING_OLD:
1837 	case SO_TIMESTAMPING_NEW:
1838 		lv = sizeof(v.timestamping);
1839 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1840 		 * returning the flags when they were set through the same option.
1841 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1842 		 */
1843 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1844 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1845 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1846 		}
1847 		break;
1848 
1849 	case SO_RCVTIMEO_OLD:
1850 	case SO_RCVTIMEO_NEW:
1851 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1852 				      SO_RCVTIMEO_OLD == optname);
1853 		break;
1854 
1855 	case SO_SNDTIMEO_OLD:
1856 	case SO_SNDTIMEO_NEW:
1857 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1858 				      SO_SNDTIMEO_OLD == optname);
1859 		break;
1860 
1861 	case SO_RCVLOWAT:
1862 		v.val = READ_ONCE(sk->sk_rcvlowat);
1863 		break;
1864 
1865 	case SO_SNDLOWAT:
1866 		v.val = 1;
1867 		break;
1868 
1869 	case SO_PASSCRED:
1870 		if (!sk_may_scm_recv(sk))
1871 			return -EOPNOTSUPP;
1872 
1873 		v.val = sk->sk_scm_credentials;
1874 		break;
1875 
1876 	case SO_PASSPIDFD:
1877 		if (!sk_is_unix(sk))
1878 			return -EOPNOTSUPP;
1879 
1880 		v.val = sk->sk_scm_pidfd;
1881 		break;
1882 
1883 	case SO_PASSRIGHTS:
1884 		if (!sk_is_unix(sk))
1885 			return -EOPNOTSUPP;
1886 
1887 		v.val = sk->sk_scm_rights;
1888 		break;
1889 
1890 	case SO_PEERCRED:
1891 	{
1892 		struct ucred peercred;
1893 		if (len > sizeof(peercred))
1894 			len = sizeof(peercred);
1895 
1896 		spin_lock(&sk->sk_peer_lock);
1897 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1898 		spin_unlock(&sk->sk_peer_lock);
1899 
1900 		if (copy_to_sockptr(optval, &peercred, len))
1901 			return -EFAULT;
1902 		goto lenout;
1903 	}
1904 
1905 	case SO_PEERPIDFD:
1906 	{
1907 		struct pid *peer_pid;
1908 		struct file *pidfd_file = NULL;
1909 		unsigned int flags = 0;
1910 		int pidfd;
1911 
1912 		if (len > sizeof(pidfd))
1913 			len = sizeof(pidfd);
1914 
1915 		spin_lock(&sk->sk_peer_lock);
1916 		peer_pid = get_pid(sk->sk_peer_pid);
1917 		spin_unlock(&sk->sk_peer_lock);
1918 
1919 		if (!peer_pid)
1920 			return -ENODATA;
1921 
1922 		/* The use of PIDFD_STALE requires stashing of struct pid
1923 		 * on pidfs with pidfs_register_pid(), and only AF_UNIX
1924 		 * sockets are prepared for this.
1925 		 */
1926 		if (sk->sk_family == AF_UNIX)
1927 			flags = PIDFD_STALE;
1928 
1929 		pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1930 		put_pid(peer_pid);
1931 		if (pidfd < 0)
1932 			return pidfd;
1933 
1934 		if (copy_to_sockptr(optval, &pidfd, len) ||
1935 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1936 			put_unused_fd(pidfd);
1937 			fput(pidfd_file);
1938 
1939 			return -EFAULT;
1940 		}
1941 
1942 		fd_install(pidfd, pidfd_file);
1943 		return 0;
1944 	}
1945 
1946 	case SO_PEERGROUPS:
1947 	{
1948 		const struct cred *cred;
1949 		int ret, n;
1950 
1951 		cred = sk_get_peer_cred(sk);
1952 		if (!cred)
1953 			return -ENODATA;
1954 
1955 		n = cred->group_info->ngroups;
1956 		if (len < n * sizeof(gid_t)) {
1957 			len = n * sizeof(gid_t);
1958 			put_cred(cred);
1959 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1960 		}
1961 		len = n * sizeof(gid_t);
1962 
1963 		ret = groups_to_user(optval, cred->group_info);
1964 		put_cred(cred);
1965 		if (ret)
1966 			return ret;
1967 		goto lenout;
1968 	}
1969 
1970 	case SO_PEERNAME:
1971 	{
1972 		struct sockaddr_storage address;
1973 
1974 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1975 		if (lv < 0)
1976 			return -ENOTCONN;
1977 		if (lv < len)
1978 			return -EINVAL;
1979 		if (copy_to_sockptr(optval, &address, len))
1980 			return -EFAULT;
1981 		goto lenout;
1982 	}
1983 
1984 	/* Dubious BSD thing... Probably nobody even uses it, but
1985 	 * the UNIX standard wants it for whatever reason... -DaveM
1986 	 */
1987 	case SO_ACCEPTCONN:
1988 		v.val = sk->sk_state == TCP_LISTEN;
1989 		break;
1990 
1991 	case SO_PASSSEC:
1992 		if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
1993 			return -EOPNOTSUPP;
1994 
1995 		v.val = sk->sk_scm_security;
1996 		break;
1997 
1998 	case SO_PEERSEC:
1999 		return security_socket_getpeersec_stream(sock,
2000 							 optval, optlen, len);
2001 
2002 	case SO_MARK:
2003 		v.val = READ_ONCE(sk->sk_mark);
2004 		break;
2005 
2006 	case SO_RCVMARK:
2007 		v.val = sock_flag(sk, SOCK_RCVMARK);
2008 		break;
2009 
2010 	case SO_RCVPRIORITY:
2011 		v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2012 		break;
2013 
2014 	case SO_RXQ_OVFL:
2015 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2016 		break;
2017 
2018 	case SO_WIFI_STATUS:
2019 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2020 		break;
2021 
2022 	case SO_PEEK_OFF:
2023 		if (!READ_ONCE(sock->ops)->set_peek_off)
2024 			return -EOPNOTSUPP;
2025 
2026 		v.val = READ_ONCE(sk->sk_peek_off);
2027 		break;
2028 	case SO_NOFCS:
2029 		v.val = sock_flag(sk, SOCK_NOFCS);
2030 		break;
2031 
2032 	case SO_BINDTODEVICE:
2033 		return sock_getbindtodevice(sk, optval, optlen, len);
2034 
2035 	case SO_GET_FILTER:
2036 		len = sk_get_filter(sk, optval, len);
2037 		if (len < 0)
2038 			return len;
2039 
2040 		goto lenout;
2041 
2042 	case SO_LOCK_FILTER:
2043 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2044 		break;
2045 
2046 	case SO_BPF_EXTENSIONS:
2047 		v.val = bpf_tell_extensions();
2048 		break;
2049 
2050 	case SO_SELECT_ERR_QUEUE:
2051 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2052 		break;
2053 
2054 #ifdef CONFIG_NET_RX_BUSY_POLL
2055 	case SO_BUSY_POLL:
2056 		v.val = READ_ONCE(sk->sk_ll_usec);
2057 		break;
2058 	case SO_PREFER_BUSY_POLL:
2059 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2060 		break;
2061 #endif
2062 
2063 	case SO_MAX_PACING_RATE:
2064 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2065 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2066 			lv = sizeof(v.ulval);
2067 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2068 		} else {
2069 			/* 32bit version */
2070 			v.val = min_t(unsigned long, ~0U,
2071 				      READ_ONCE(sk->sk_max_pacing_rate));
2072 		}
2073 		break;
2074 
2075 	case SO_INCOMING_CPU:
2076 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2077 		break;
2078 
2079 	case SO_MEMINFO:
2080 	{
2081 		u32 meminfo[SK_MEMINFO_VARS];
2082 
2083 		sk_get_meminfo(sk, meminfo);
2084 
2085 		len = min_t(unsigned int, len, sizeof(meminfo));
2086 		if (copy_to_sockptr(optval, &meminfo, len))
2087 			return -EFAULT;
2088 
2089 		goto lenout;
2090 	}
2091 
2092 #ifdef CONFIG_NET_RX_BUSY_POLL
2093 	case SO_INCOMING_NAPI_ID:
2094 		v.val = READ_ONCE(sk->sk_napi_id);
2095 
2096 		/* aggregate non-NAPI IDs down to 0 */
2097 		if (!napi_id_valid(v.val))
2098 			v.val = 0;
2099 
2100 		break;
2101 #endif
2102 
2103 	case SO_COOKIE:
2104 		lv = sizeof(u64);
2105 		if (len < lv)
2106 			return -EINVAL;
2107 		v.val64 = sock_gen_cookie(sk);
2108 		break;
2109 
2110 	case SO_ZEROCOPY:
2111 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2112 		break;
2113 
2114 	case SO_TXTIME:
2115 		lv = sizeof(v.txtime);
2116 		v.txtime.clockid = sk->sk_clockid;
2117 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2118 				  SOF_TXTIME_DEADLINE_MODE : 0;
2119 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2120 				  SOF_TXTIME_REPORT_ERRORS : 0;
2121 		break;
2122 
2123 	case SO_BINDTOIFINDEX:
2124 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2125 		break;
2126 
2127 	case SO_NETNS_COOKIE:
2128 		lv = sizeof(u64);
2129 		if (len != lv)
2130 			return -EINVAL;
2131 		v.val64 = sock_net(sk)->net_cookie;
2132 		break;
2133 
2134 	case SO_BUF_LOCK:
2135 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2136 		break;
2137 
2138 	case SO_RESERVE_MEM:
2139 		v.val = READ_ONCE(sk->sk_reserved_mem);
2140 		break;
2141 
2142 	case SO_TXREHASH:
2143 		if (!sk_is_tcp(sk))
2144 			return -EOPNOTSUPP;
2145 
2146 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2147 		v.val = READ_ONCE(sk->sk_txrehash);
2148 		break;
2149 
2150 	default:
2151 		/* We implement the SO_SNDLOWAT etc to not be settable
2152 		 * (1003.1g 7).
2153 		 */
2154 		return -ENOPROTOOPT;
2155 	}
2156 
2157 	if (len > lv)
2158 		len = lv;
2159 	if (copy_to_sockptr(optval, &v, len))
2160 		return -EFAULT;
2161 lenout:
2162 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2163 		return -EFAULT;
2164 	return 0;
2165 }
2166 
2167 /*
2168  * Initialize an sk_lock.
2169  *
2170  * (We also register the sk_lock with the lock validator.)
2171  */
2172 static inline void sock_lock_init(struct sock *sk)
2173 {
2174 	sk_owner_clear(sk);
2175 
2176 	if (sk->sk_kern_sock)
2177 		sock_lock_init_class_and_name(
2178 			sk,
2179 			af_family_kern_slock_key_strings[sk->sk_family],
2180 			af_family_kern_slock_keys + sk->sk_family,
2181 			af_family_kern_key_strings[sk->sk_family],
2182 			af_family_kern_keys + sk->sk_family);
2183 	else
2184 		sock_lock_init_class_and_name(
2185 			sk,
2186 			af_family_slock_key_strings[sk->sk_family],
2187 			af_family_slock_keys + sk->sk_family,
2188 			af_family_key_strings[sk->sk_family],
2189 			af_family_keys + sk->sk_family);
2190 }
2191 
2192 /*
2193  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2194  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2195  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2196  */
2197 static void sock_copy(struct sock *nsk, const struct sock *osk)
2198 {
2199 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2200 #ifdef CONFIG_SECURITY_NETWORK
2201 	void *sptr = nsk->sk_security;
2202 #endif
2203 
2204 	/* If we move sk_tx_queue_mapping out of the private section,
2205 	 * we must check if sk_tx_queue_clear() is called after
2206 	 * sock_copy() in sk_clone_lock().
2207 	 */
2208 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2209 		     offsetof(struct sock, sk_dontcopy_begin) ||
2210 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2211 		     offsetof(struct sock, sk_dontcopy_end));
2212 
2213 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2214 
2215 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2216 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2217 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2218 
2219 #ifdef CONFIG_SECURITY_NETWORK
2220 	nsk->sk_security = sptr;
2221 	security_sk_clone(osk, nsk);
2222 #endif
2223 }
2224 
2225 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2226 		int family)
2227 {
2228 	struct sock *sk;
2229 	struct kmem_cache *slab;
2230 
2231 	slab = prot->slab;
2232 	if (slab != NULL) {
2233 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2234 		if (!sk)
2235 			return sk;
2236 		if (want_init_on_alloc(priority))
2237 			sk_prot_clear_nulls(sk, prot->obj_size);
2238 	} else
2239 		sk = kmalloc(prot->obj_size, priority);
2240 
2241 	if (sk != NULL) {
2242 		if (security_sk_alloc(sk, family, priority))
2243 			goto out_free;
2244 
2245 		if (!try_module_get(prot->owner))
2246 			goto out_free_sec;
2247 	}
2248 
2249 	return sk;
2250 
2251 out_free_sec:
2252 	security_sk_free(sk);
2253 out_free:
2254 	if (slab != NULL)
2255 		kmem_cache_free(slab, sk);
2256 	else
2257 		kfree(sk);
2258 	return NULL;
2259 }
2260 
2261 static void sk_prot_free(struct proto *prot, struct sock *sk)
2262 {
2263 	struct kmem_cache *slab;
2264 	struct module *owner;
2265 
2266 	owner = prot->owner;
2267 	slab = prot->slab;
2268 
2269 	cgroup_sk_free(&sk->sk_cgrp_data);
2270 	mem_cgroup_sk_free(sk);
2271 	security_sk_free(sk);
2272 
2273 	sk_owner_put(sk);
2274 
2275 	if (slab != NULL)
2276 		kmem_cache_free(slab, sk);
2277 	else
2278 		kfree(sk);
2279 	module_put(owner);
2280 }
2281 
2282 /**
2283  *	sk_alloc - All socket objects are allocated here
2284  *	@net: the applicable net namespace
2285  *	@family: protocol family
2286  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2287  *	@prot: struct proto associated with this new sock instance
2288  *	@kern: is this to be a kernel socket?
2289  */
2290 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2291 		      struct proto *prot, int kern)
2292 {
2293 	struct sock *sk;
2294 
2295 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2296 	if (sk) {
2297 		sk->sk_family = family;
2298 		/*
2299 		 * See comment in struct sock definition to understand
2300 		 * why we need sk_prot_creator -acme
2301 		 */
2302 		sk->sk_prot = sk->sk_prot_creator = prot;
2303 		sk->sk_kern_sock = kern;
2304 		sock_lock_init(sk);
2305 		sk->sk_net_refcnt = kern ? 0 : 1;
2306 		if (likely(sk->sk_net_refcnt)) {
2307 			get_net_track(net, &sk->ns_tracker, priority);
2308 			sock_inuse_add(net, 1);
2309 		} else {
2310 			net_passive_inc(net);
2311 			__netns_tracker_alloc(net, &sk->ns_tracker,
2312 					      false, priority);
2313 		}
2314 
2315 		sock_net_set(sk, net);
2316 		refcount_set(&sk->sk_wmem_alloc, 1);
2317 
2318 		mem_cgroup_sk_alloc(sk);
2319 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2320 		sock_update_classid(&sk->sk_cgrp_data);
2321 		sock_update_netprioidx(&sk->sk_cgrp_data);
2322 		sk_tx_queue_clear(sk);
2323 	}
2324 
2325 	return sk;
2326 }
2327 EXPORT_SYMBOL(sk_alloc);
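
/* A minimal usage sketch for sk_alloc(): a protocol ->create() handler
 * typically allocates its sock here and then runs sock_init_data().
 * my_create() and "my_proto" below are hypothetical and shown only for
 * illustration; real handlers also do protocol specific setup and more
 * error handling.
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */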
2328 
2329 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2330  * grace period. This is the case for UDP sockets and TCP listeners.
2331  */
2332 static void __sk_destruct(struct rcu_head *head)
2333 {
2334 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2335 	struct net *net = sock_net(sk);
2336 	struct sk_filter *filter;
2337 
2338 	if (sk->sk_destruct)
2339 		sk->sk_destruct(sk);
2340 
2341 	filter = rcu_dereference_check(sk->sk_filter,
2342 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2343 	if (filter) {
2344 		sk_filter_uncharge(sk, filter);
2345 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2346 	}
2347 
2348 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2349 
2350 #ifdef CONFIG_BPF_SYSCALL
2351 	bpf_sk_storage_free(sk);
2352 #endif
2353 
2354 	if (atomic_read(&sk->sk_omem_alloc))
2355 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2356 			 __func__, atomic_read(&sk->sk_omem_alloc));
2357 
2358 	if (sk->sk_frag.page) {
2359 		put_page(sk->sk_frag.page);
2360 		sk->sk_frag.page = NULL;
2361 	}
2362 
2363 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2364 	put_cred(sk->sk_peer_cred);
2365 	put_pid(sk->sk_peer_pid);
2366 
2367 	if (likely(sk->sk_net_refcnt)) {
2368 		put_net_track(net, &sk->ns_tracker);
2369 	} else {
2370 		__netns_tracker_free(net, &sk->ns_tracker, false);
2371 		net_passive_dec(net);
2372 	}
2373 	sk_prot_free(sk->sk_prot_creator, sk);
2374 }
2375 
2376 void sk_net_refcnt_upgrade(struct sock *sk)
2377 {
2378 	struct net *net = sock_net(sk);
2379 
2380 	WARN_ON_ONCE(sk->sk_net_refcnt);
2381 	__netns_tracker_free(net, &sk->ns_tracker, false);
2382 	net_passive_dec(net);
2383 	sk->sk_net_refcnt = 1;
2384 	get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2385 	sock_inuse_add(net, 1);
2386 }
2387 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2388 
2389 void sk_destruct(struct sock *sk)
2390 {
2391 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2392 
2393 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2394 		reuseport_detach_sock(sk);
2395 		use_call_rcu = true;
2396 	}
2397 
2398 	if (use_call_rcu)
2399 		call_rcu(&sk->sk_rcu, __sk_destruct);
2400 	else
2401 		__sk_destruct(&sk->sk_rcu);
2402 }
2403 
2404 static void __sk_free(struct sock *sk)
2405 {
2406 	if (likely(sk->sk_net_refcnt))
2407 		sock_inuse_add(sock_net(sk), -1);
2408 
2409 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2410 		sock_diag_broadcast_destroy(sk);
2411 	else
2412 		sk_destruct(sk);
2413 }
2414 
2415 void sk_free(struct sock *sk)
2416 {
2417 	/*
2418 	 * We subtract one from sk_wmem_alloc so we can tell whether
2419 	 * some packets are still in a tx queue.
2420 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
2421 	 */
2422 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2423 		__sk_free(sk);
2424 }
2425 EXPORT_SYMBOL(sk_free);
2426 
2427 static void sk_init_common(struct sock *sk)
2428 {
2429 	skb_queue_head_init(&sk->sk_receive_queue);
2430 	skb_queue_head_init(&sk->sk_write_queue);
2431 	skb_queue_head_init(&sk->sk_error_queue);
2432 
2433 	rwlock_init(&sk->sk_callback_lock);
2434 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2435 			af_rlock_keys + sk->sk_family,
2436 			af_family_rlock_key_strings[sk->sk_family]);
2437 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2438 			af_wlock_keys + sk->sk_family,
2439 			af_family_wlock_key_strings[sk->sk_family]);
2440 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2441 			af_elock_keys + sk->sk_family,
2442 			af_family_elock_key_strings[sk->sk_family]);
2443 	if (sk->sk_kern_sock)
2444 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2445 			af_kern_callback_keys + sk->sk_family,
2446 			af_family_kern_clock_key_strings[sk->sk_family]);
2447 	else
2448 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2449 			af_callback_keys + sk->sk_family,
2450 			af_family_clock_key_strings[sk->sk_family]);
2451 }
2452 
2453 /**
2454  *	sk_clone_lock - clone a socket, and lock its clone
2455  *	@sk: the socket to clone
2456  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2457  *
2458  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2459  */
2460 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2461 {
2462 	struct proto *prot = READ_ONCE(sk->sk_prot);
2463 	struct sk_filter *filter;
2464 	bool is_charged = true;
2465 	struct sock *newsk;
2466 
2467 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2468 	if (!newsk)
2469 		goto out;
2470 
2471 	sock_copy(newsk, sk);
2472 
2473 	newsk->sk_prot_creator = prot;
2474 
2475 	/* SANITY */
2476 	if (likely(newsk->sk_net_refcnt)) {
2477 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2478 		sock_inuse_add(sock_net(newsk), 1);
2479 	} else {
2480 		/* Kernel sockets do not elevate the struct net refcount.
2481 		 * Instead, use a tracker to more easily detect if a layer
2482 		 * is not properly dismantling its kernel sockets at netns
2483 		 * destroy time.
2484 		 */
2485 		net_passive_inc(sock_net(newsk));
2486 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2487 				      false, priority);
2488 	}
2489 	sk_node_init(&newsk->sk_node);
2490 	sock_lock_init(newsk);
2491 	bh_lock_sock(newsk);
2492 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2493 	newsk->sk_backlog.len = 0;
2494 
2495 	atomic_set(&newsk->sk_rmem_alloc, 0);
2496 
2497 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2498 	refcount_set(&newsk->sk_wmem_alloc, 1);
2499 
2500 	atomic_set(&newsk->sk_omem_alloc, 0);
2501 	sk_init_common(newsk);
2502 
2503 	newsk->sk_dst_cache	= NULL;
2504 	newsk->sk_dst_pending_confirm = 0;
2505 	newsk->sk_wmem_queued	= 0;
2506 	newsk->sk_forward_alloc = 0;
2507 	newsk->sk_reserved_mem  = 0;
2508 	atomic_set(&newsk->sk_drops, 0);
2509 	newsk->sk_send_head	= NULL;
2510 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2511 	atomic_set(&newsk->sk_zckey, 0);
2512 
2513 	sock_reset_flag(newsk, SOCK_DONE);
2514 
2515 	/* sk->sk_memcg will be populated at accept() time */
2516 	newsk->sk_memcg = NULL;
2517 
2518 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2519 
2520 	rcu_read_lock();
2521 	filter = rcu_dereference(sk->sk_filter);
2522 	if (filter != NULL)
2523 		/* though it's an empty new sock, the charging may fail
2524 		 * if sysctl_optmem_max was changed between the creation of
2525 		 * the original socket and the cloning.
2526 		 */
2527 		is_charged = sk_filter_charge(newsk, filter);
2528 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2529 	rcu_read_unlock();
2530 
2531 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2532 		/* We need to make sure that we don't uncharge the new
2533 		 * socket if we couldn't charge it in the first place
2534 		 * as otherwise we uncharge the parent's filter.
2535 		 */
2536 		if (!is_charged)
2537 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2538 
2539 		goto free;
2540 	}
2541 
2542 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2543 
2544 	if (bpf_sk_storage_clone(sk, newsk))
2545 		goto free;
2546 
2547 	/* Clear sk_user_data if parent had the pointer tagged
2548 	 * as not suitable for copying when cloning.
2549 	 */
2550 	if (sk_user_data_is_nocopy(newsk))
2551 		newsk->sk_user_data = NULL;
2552 
2553 	newsk->sk_err	   = 0;
2554 	newsk->sk_err_soft = 0;
2555 	newsk->sk_priority = 0;
2556 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2557 
2558 	/* Before updating sk_refcnt, we must commit prior changes to memory
2559 	 * (Documentation/RCU/rculist_nulls.rst for details)
2560 	 */
2561 	smp_wmb();
2562 	refcount_set(&newsk->sk_refcnt, 2);
2563 
2564 	sk_set_socket(newsk, NULL);
2565 	sk_tx_queue_clear(newsk);
2566 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2567 
2568 	if (newsk->sk_prot->sockets_allocated)
2569 		sk_sockets_allocated_inc(newsk);
2570 
2571 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2572 		net_enable_timestamp();
2573 out:
2574 	return newsk;
2575 free:
2576 	/* It is still a raw copy of the parent, so invalidate
2577 	 * the destructor and do a plain sk_free().
2578 	 */
2579 	newsk->sk_destruct = NULL;
2580 	bh_unlock_sock(newsk);
2581 	sk_free(newsk);
2582 	newsk = NULL;
2583 	goto out;
2584 }
2585 EXPORT_SYMBOL_GPL(sk_clone_lock);
2586 
2587 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2588 {
2589 	bool is_ipv6 = false;
2590 	u32 max_size;
2591 
2592 #if IS_ENABLED(CONFIG_IPV6)
2593 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2594 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2595 #endif
2596 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2597 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2598 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2599 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2600 		max_size = GSO_LEGACY_MAX_SIZE;
2601 
2602 	return max_size - (MAX_TCP_HEADER + 1);
2603 }
2604 
2605 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2606 {
2607 	u32 max_segs = 1;
2608 
2609 	sk->sk_route_caps = dst->dev->features;
2610 	if (sk_is_tcp(sk)) {
2611 		struct inet_connection_sock *icsk = inet_csk(sk);
2612 
2613 		sk->sk_route_caps |= NETIF_F_GSO;
2614 		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2615 	}
2616 	if (sk->sk_route_caps & NETIF_F_GSO)
2617 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2618 	if (unlikely(sk->sk_gso_disabled))
2619 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2620 	if (sk_can_gso(sk)) {
2621 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2622 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2623 		} else {
2624 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2625 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2626 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2627 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2628 		}
2629 	}
2630 	sk->sk_gso_max_segs = max_segs;
2631 	sk_dst_set(sk, dst);
2632 }
2633 EXPORT_SYMBOL_GPL(sk_setup_caps);
2634 
2635 /*
2636  *	Simple resource managers for sockets.
2637  */
2638 
2639 
2640 /*
2641  * Write buffer destructor automatically called from kfree_skb.
2642  */
2643 void sock_wfree(struct sk_buff *skb)
2644 {
2645 	struct sock *sk = skb->sk;
2646 	unsigned int len = skb->truesize;
2647 	bool free;
2648 
2649 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2650 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2651 		    sk->sk_write_space == sock_def_write_space) {
2652 			rcu_read_lock();
2653 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2654 			sock_def_write_space_wfree(sk);
2655 			rcu_read_unlock();
2656 			if (unlikely(free))
2657 				__sk_free(sk);
2658 			return;
2659 		}
2660 
2661 		/*
2662 		 * Keep a reference on sk_wmem_alloc; it will be released
2663 		 * after the sk_write_space() call.
2664 		 */
2665 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2666 		sk->sk_write_space(sk);
2667 		len = 1;
2668 	}
2669 	/*
2670 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2671 	 * could not do because of in-flight packets
2672 	 */
2673 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2674 		__sk_free(sk);
2675 }
2676 EXPORT_SYMBOL(sock_wfree);
2677 
2678 /* This variant of sock_wfree() is used by TCP,
2679  * since it sets SOCK_USE_WRITE_QUEUE.
2680  */
2681 void __sock_wfree(struct sk_buff *skb)
2682 {
2683 	struct sock *sk = skb->sk;
2684 
2685 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2686 		__sk_free(sk);
2687 }
2688 
2689 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2690 {
2691 	skb_orphan(skb);
2692 #ifdef CONFIG_INET
2693 	if (unlikely(!sk_fullsock(sk)))
2694 		return skb_set_owner_edemux(skb, sk);
2695 #endif
2696 	skb->sk = sk;
2697 	skb->destructor = sock_wfree;
2698 	skb_set_hash_from_sk(skb, sk);
2699 	/*
2700 	 * We used to take a refcount on sk, but the following operation
2701 	 * is enough to guarantee sk_free() won't free this sock until
2702 	 * all in-flight packets are completed
2703 	 */
2704 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2705 }
2706 EXPORT_SYMBOL(skb_set_owner_w);
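
/* A minimal usage sketch for skb_set_owner_w(): when an skb is allocated
 * outside of sock_wmalloc()/sock_alloc_send_pskb(), a transmit path can
 * still charge it to the sending socket so its truesize is accounted in
 * sk_wmem_alloc and released by sock_wfree(). "len" is a hypothetical size.
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOMEM;
 *	skb_set_owner_w(skb, sk);
 */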
2707 
2708 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2709 {
2710 	/* Drivers depend on in-order delivery for crypto offload,
2711 	 * partial orphan breaks out-of-order-OK logic.
2712 	 */
2713 	if (skb_is_decrypted(skb))
2714 		return false;
2715 
2716 	return (skb->destructor == sock_wfree ||
2717 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2718 }
2719 
2720 /* This helper is used by netem, as it can hold packets in its
2721  * delay queue. We want to allow the owner socket to send more
2722  * packets, as if they were already TX completed by a typical driver.
2723  * But we also want to keep skb->sk set because some packet schedulers
2724  * rely on it (sch_fq for example).
2725  */
2726 void skb_orphan_partial(struct sk_buff *skb)
2727 {
2728 	if (skb_is_tcp_pure_ack(skb))
2729 		return;
2730 
2731 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2732 		return;
2733 
2734 	skb_orphan(skb);
2735 }
2736 EXPORT_SYMBOL(skb_orphan_partial);
2737 
2738 /*
2739  * Read buffer destructor automatically called from kfree_skb.
2740  */
2741 void sock_rfree(struct sk_buff *skb)
2742 {
2743 	struct sock *sk = skb->sk;
2744 	unsigned int len = skb->truesize;
2745 
2746 	atomic_sub(len, &sk->sk_rmem_alloc);
2747 	sk_mem_uncharge(sk, len);
2748 }
2749 EXPORT_SYMBOL(sock_rfree);
2750 
2751 /*
2752  * Buffer destructor for skbs that are not used directly in read or write
2753  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2754  */
2755 void sock_efree(struct sk_buff *skb)
2756 {
2757 	sock_put(skb->sk);
2758 }
2759 EXPORT_SYMBOL(sock_efree);
2760 
2761 /* Buffer destructor for prefetch/receive path where reference count may
2762  * not be held, e.g. for listen sockets.
2763  */
2764 #ifdef CONFIG_INET
2765 void sock_pfree(struct sk_buff *skb)
2766 {
2767 	struct sock *sk = skb->sk;
2768 
2769 	if (!sk_is_refcounted(sk))
2770 		return;
2771 
2772 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2773 		inet_reqsk(sk)->rsk_listener = NULL;
2774 		reqsk_free(inet_reqsk(sk));
2775 		return;
2776 	}
2777 
2778 	sock_gen_put(sk);
2779 }
2780 EXPORT_SYMBOL(sock_pfree);
2781 #endif /* CONFIG_INET */
2782 
2783 kuid_t sock_i_uid(struct sock *sk)
2784 {
2785 	kuid_t uid;
2786 
2787 	read_lock_bh(&sk->sk_callback_lock);
2788 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2789 	read_unlock_bh(&sk->sk_callback_lock);
2790 	return uid;
2791 }
2792 EXPORT_SYMBOL(sock_i_uid);
2793 
2794 unsigned long __sock_i_ino(struct sock *sk)
2795 {
2796 	unsigned long ino;
2797 
2798 	read_lock(&sk->sk_callback_lock);
2799 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2800 	read_unlock(&sk->sk_callback_lock);
2801 	return ino;
2802 }
2803 EXPORT_SYMBOL(__sock_i_ino);
2804 
2805 unsigned long sock_i_ino(struct sock *sk)
2806 {
2807 	unsigned long ino;
2808 
2809 	local_bh_disable();
2810 	ino = __sock_i_ino(sk);
2811 	local_bh_enable();
2812 	return ino;
2813 }
2814 EXPORT_SYMBOL(sock_i_ino);
2815 
2816 /*
2817  * Allocate a skb from the socket's send buffer.
2818  */
2819 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2820 			     gfp_t priority)
2821 {
2822 	if (force ||
2823 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2824 		struct sk_buff *skb = alloc_skb(size, priority);
2825 
2826 		if (skb) {
2827 			skb_set_owner_w(skb, sk);
2828 			return skb;
2829 		}
2830 	}
2831 	return NULL;
2832 }
2833 EXPORT_SYMBOL(sock_wmalloc);
2834 
2835 static void sock_ofree(struct sk_buff *skb)
2836 {
2837 	struct sock *sk = skb->sk;
2838 
2839 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2840 }
2841 
2842 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2843 			     gfp_t priority)
2844 {
2845 	struct sk_buff *skb;
2846 
2847 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2848 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2849 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2850 		return NULL;
2851 
2852 	skb = alloc_skb(size, priority);
2853 	if (!skb)
2854 		return NULL;
2855 
2856 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2857 	skb->sk = sk;
2858 	skb->destructor = sock_ofree;
2859 	return skb;
2860 }
2861 
2862 /*
2863  * Allocate a memory block from the socket's option memory buffer.
2864  */
2865 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2866 {
2867 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2868 
2869 	if ((unsigned int)size <= optmem_max &&
2870 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2871 		void *mem;
2872 		/* First do the add, to avoid the race if kmalloc
2873 		 * might sleep.
2874 		 */
2875 		atomic_add(size, &sk->sk_omem_alloc);
2876 		mem = kmalloc(size, priority);
2877 		if (mem)
2878 			return mem;
2879 		atomic_sub(size, &sk->sk_omem_alloc);
2880 	}
2881 	return NULL;
2882 }
2883 EXPORT_SYMBOL(sock_kmalloc);
2884 
2885 /*
2886  * Duplicate the input "src" memory block using the socket's
2887  * option memory buffer.
2888  */
2889 void *sock_kmemdup(struct sock *sk, const void *src,
2890 		   int size, gfp_t priority)
2891 {
2892 	void *mem;
2893 
2894 	mem = sock_kmalloc(sk, size, priority);
2895 	if (mem)
2896 		memcpy(mem, src, size);
2897 	return mem;
2898 }
2899 EXPORT_SYMBOL(sock_kmemdup);
2900 
2901 /* Free an option memory block. Note, we actually want the inline
2902  * here as this allows gcc to detect the nullify and fold away the
2903  * condition entirely.
2904  */
2905 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2906 				  const bool nullify)
2907 {
2908 	if (WARN_ON_ONCE(!mem))
2909 		return;
2910 	if (nullify)
2911 		kfree_sensitive(mem);
2912 	else
2913 		kfree(mem);
2914 	atomic_sub(size, &sk->sk_omem_alloc);
2915 }
2916 
2917 void sock_kfree_s(struct sock *sk, void *mem, int size)
2918 {
2919 	__sock_kfree_s(sk, mem, size, false);
2920 }
2921 EXPORT_SYMBOL(sock_kfree_s);
2922 
2923 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2924 {
2925 	__sock_kfree_s(sk, mem, size, true);
2926 }
2927 EXPORT_SYMBOL(sock_kzfree_s);
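
/* A minimal usage sketch for the option memory helpers: allocations are
 * charged against sk->sk_omem_alloc, so every sock_kmalloc() must be paired
 * with sock_kfree_s() (or sock_kzfree_s() for sensitive data) using the same
 * size. "len" is a hypothetical size.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */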
2928 
2929 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2930    I think these locks should be removed for datagram sockets.
2931  */
2932 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2933 {
2934 	DEFINE_WAIT(wait);
2935 
2936 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2937 	for (;;) {
2938 		if (!timeo)
2939 			break;
2940 		if (signal_pending(current))
2941 			break;
2942 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2943 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2944 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2945 			break;
2946 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2947 			break;
2948 		if (READ_ONCE(sk->sk_err))
2949 			break;
2950 		timeo = schedule_timeout(timeo);
2951 	}
2952 	finish_wait(sk_sleep(sk), &wait);
2953 	return timeo;
2954 }
2955 
2956 
2957 /*
2958  *	Generic send/receive buffer handlers
2959  */
2960 
2961 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2962 				     unsigned long data_len, int noblock,
2963 				     int *errcode, int max_page_order)
2964 {
2965 	struct sk_buff *skb;
2966 	long timeo;
2967 	int err;
2968 
2969 	timeo = sock_sndtimeo(sk, noblock);
2970 	for (;;) {
2971 		err = sock_error(sk);
2972 		if (err != 0)
2973 			goto failure;
2974 
2975 		err = -EPIPE;
2976 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2977 			goto failure;
2978 
2979 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2980 			break;
2981 
2982 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2983 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2984 		err = -EAGAIN;
2985 		if (!timeo)
2986 			goto failure;
2987 		if (signal_pending(current))
2988 			goto interrupted;
2989 		timeo = sock_wait_for_wmem(sk, timeo);
2990 	}
2991 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2992 				   errcode, sk->sk_allocation);
2993 	if (skb)
2994 		skb_set_owner_w(skb, sk);
2995 	return skb;
2996 
2997 interrupted:
2998 	err = sock_intr_errno(timeo);
2999 failure:
3000 	*errcode = err;
3001 	return NULL;
3002 }
3003 EXPORT_SYMBOL(sock_alloc_send_pskb);
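
/* A minimal usage sketch for sock_alloc_send_pskb(): a datagram sendmsg()
 * path commonly reserves linear header room and lets the payload live in
 * page fragments. "hlen" and "dlen" are hypothetical sizes; the order
 * argument bounds the compound pages used for the fragments.
 *
 *	skb = sock_alloc_send_pskb(sk, hlen, dlen,
 *				   msg->msg_flags & MSG_DONTWAIT, &err,
 *				   PAGE_ALLOC_COSTLY_ORDER);
 *	if (!skb)
 *		return err;
 */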
3004 
3005 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3006 		     struct sockcm_cookie *sockc)
3007 {
3008 	u32 tsflags;
3009 
3010 	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3011 
3012 	switch (cmsg->cmsg_type) {
3013 	case SO_MARK:
3014 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3015 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3016 			return -EPERM;
3017 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3018 			return -EINVAL;
3019 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3020 		break;
3021 	case SO_TIMESTAMPING_OLD:
3022 	case SO_TIMESTAMPING_NEW:
3023 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3024 			return -EINVAL;
3025 
3026 		tsflags = *(u32 *)CMSG_DATA(cmsg);
3027 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3028 			return -EINVAL;
3029 
3030 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3031 		sockc->tsflags |= tsflags;
3032 		break;
3033 	case SCM_TXTIME:
3034 		if (!sock_flag(sk, SOCK_TXTIME))
3035 			return -EINVAL;
3036 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3037 			return -EINVAL;
3038 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3039 		break;
3040 	case SCM_TS_OPT_ID:
3041 		if (sk_is_tcp(sk))
3042 			return -EINVAL;
3043 		tsflags = READ_ONCE(sk->sk_tsflags);
3044 		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3045 			return -EINVAL;
3046 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3047 			return -EINVAL;
3048 		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3049 		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3050 		break;
3051 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3052 	case SCM_RIGHTS:
3053 	case SCM_CREDENTIALS:
3054 		break;
3055 	case SO_PRIORITY:
3056 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3057 			return -EINVAL;
3058 		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3059 			return -EPERM;
3060 		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3061 		break;
3062 	case SCM_DEVMEM_DMABUF:
3063 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3064 			return -EINVAL;
3065 		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3066 		break;
3067 	default:
3068 		return -EINVAL;
3069 	}
3070 	return 0;
3071 }
3072 EXPORT_SYMBOL(__sock_cmsg_send);
3073 
3074 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3075 		   struct sockcm_cookie *sockc)
3076 {
3077 	struct cmsghdr *cmsg;
3078 	int ret;
3079 
3080 	for_each_cmsghdr(cmsg, msg) {
3081 		if (!CMSG_OK(msg, cmsg))
3082 			return -EINVAL;
3083 		if (cmsg->cmsg_level != SOL_SOCKET)
3084 			continue;
3085 		ret = __sock_cmsg_send(sk, cmsg, sockc);
3086 		if (ret)
3087 			return ret;
3088 	}
3089 	return 0;
3090 }
3091 EXPORT_SYMBOL(sock_cmsg_send);
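
/* A minimal usage sketch for sock_cmsg_send(): a protocol sendmsg() handler
 * typically seeds the cookie from the socket defaults and then lets
 * SOL_SOCKET control messages override it (per-protocol cmsgs are handled
 * separately).
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (err)
 *			return err;
 *	}
 */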
3092 
3093 static void sk_enter_memory_pressure(struct sock *sk)
3094 {
3095 	if (!sk->sk_prot->enter_memory_pressure)
3096 		return;
3097 
3098 	sk->sk_prot->enter_memory_pressure(sk);
3099 }
3100 
3101 static void sk_leave_memory_pressure(struct sock *sk)
3102 {
3103 	if (sk->sk_prot->leave_memory_pressure) {
3104 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3105 				     tcp_leave_memory_pressure, sk);
3106 	} else {
3107 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3108 
3109 		if (memory_pressure && READ_ONCE(*memory_pressure))
3110 			WRITE_ONCE(*memory_pressure, 0);
3111 	}
3112 }
3113 
3114 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3115 
3116 /**
3117  * skb_page_frag_refill - check that a page_frag contains enough room
3118  * @sz: minimum size of the fragment we want to get
3119  * @pfrag: pointer to page_frag
3120  * @gfp: priority for memory allocation
3121  *
3122  * Note: While this allocator tries to use high order pages, there is
3123  * no guarantee that allocations succeed. Therefore, @sz MUST be
3124  * less than or equal to PAGE_SIZE.
3125  */
3126 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3127 {
3128 	if (pfrag->page) {
3129 		if (page_ref_count(pfrag->page) == 1) {
3130 			pfrag->offset = 0;
3131 			return true;
3132 		}
3133 		if (pfrag->offset + sz <= pfrag->size)
3134 			return true;
3135 		put_page(pfrag->page);
3136 	}
3137 
3138 	pfrag->offset = 0;
3139 	if (SKB_FRAG_PAGE_ORDER &&
3140 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3141 		/* Avoid direct reclaim but allow kswapd to wake */
3142 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3143 					  __GFP_COMP | __GFP_NOWARN |
3144 					  __GFP_NORETRY,
3145 					  SKB_FRAG_PAGE_ORDER);
3146 		if (likely(pfrag->page)) {
3147 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3148 			return true;
3149 		}
3150 	}
3151 	pfrag->page = alloc_page(gfp);
3152 	if (likely(pfrag->page)) {
3153 		pfrag->size = PAGE_SIZE;
3154 		return true;
3155 	}
3156 	return false;
3157 }
3158 EXPORT_SYMBOL(skb_page_frag_refill);
3159 
3160 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3161 {
3162 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3163 		return true;
3164 
3165 	sk_enter_memory_pressure(sk);
3166 	sk_stream_moderate_sndbuf(sk);
3167 	return false;
3168 }
3169 EXPORT_SYMBOL(sk_page_frag_refill);
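
/* A minimal usage sketch for sk_page_frag_refill(): callers fetch the
 * per-socket (or per-task) fragment with sk_page_frag() and, once the refill
 * succeeded, copy user data at pfrag->offset. "wait_for_memory" is a
 * hypothetical label in the caller.
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 */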
3170 
3171 void __lock_sock(struct sock *sk)
3172 	__releases(&sk->sk_lock.slock)
3173 	__acquires(&sk->sk_lock.slock)
3174 {
3175 	DEFINE_WAIT(wait);
3176 
3177 	for (;;) {
3178 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3179 					TASK_UNINTERRUPTIBLE);
3180 		spin_unlock_bh(&sk->sk_lock.slock);
3181 		schedule();
3182 		spin_lock_bh(&sk->sk_lock.slock);
3183 		if (!sock_owned_by_user(sk))
3184 			break;
3185 	}
3186 	finish_wait(&sk->sk_lock.wq, &wait);
3187 }
3188 
3189 void __release_sock(struct sock *sk)
3190 	__releases(&sk->sk_lock.slock)
3191 	__acquires(&sk->sk_lock.slock)
3192 {
3193 	struct sk_buff *skb, *next;
3194 
3195 	while ((skb = sk->sk_backlog.head) != NULL) {
3196 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3197 
3198 		spin_unlock_bh(&sk->sk_lock.slock);
3199 
3200 		do {
3201 			next = skb->next;
3202 			prefetch(next);
3203 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3204 			skb_mark_not_on_list(skb);
3205 			sk_backlog_rcv(sk, skb);
3206 
3207 			cond_resched();
3208 
3209 			skb = next;
3210 		} while (skb != NULL);
3211 
3212 		spin_lock_bh(&sk->sk_lock.slock);
3213 	}
3214 
3215 	/*
3216 	 * Doing the zeroing here guarantees we cannot loop forever
3217 	 * while a wild producer attempts to flood us.
3218 	 */
3219 	sk->sk_backlog.len = 0;
3220 }
3221 
3222 void __sk_flush_backlog(struct sock *sk)
3223 {
3224 	spin_lock_bh(&sk->sk_lock.slock);
3225 	__release_sock(sk);
3226 
3227 	if (sk->sk_prot->release_cb)
3228 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3229 				     tcp_release_cb, sk);
3230 
3231 	spin_unlock_bh(&sk->sk_lock.slock);
3232 }
3233 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3234 
3235 /**
3236  * sk_wait_data - wait for data to arrive at sk_receive_queue
3237  * @sk:    sock to wait on
3238  * @timeo: for how long
3239  * @skb:   last skb seen on sk_receive_queue
3240  *
3241  * Now the socket state, including sk->sk_err, is changed only under the lock,
3242  * hence we may omit checks after joining the wait queue.
3243  * We check the receive queue before schedule() only as an optimization;
3244  * it is very likely that release_sock() added new data.
3245  */
3246 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3247 {
3248 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3249 	int rc;
3250 
3251 	add_wait_queue(sk_sleep(sk), &wait);
3252 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3253 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3254 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3255 	remove_wait_queue(sk_sleep(sk), &wait);
3256 	return rc;
3257 }
3258 EXPORT_SYMBOL(sk_wait_data);
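
/* A minimal usage sketch for sk_wait_data(): a blocking recvmsg() loop,
 * run with the socket lock held, waits for the receive queue to change
 * while honouring the timeout (signal handling is omitted here).
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */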
3259 
3260 /**
3261  *	__sk_mem_raise_allocated - increase memory_allocated
3262  *	@sk: socket
3263  *	@size: memory size to allocate
3264  *	@amt: pages to allocate
3265  *	@kind: allocation type
3266  *
3267  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3268  *
3269  *	Unlike the globally shared limits among the sockets under the same protocol,
3270  *	consuming the budget of a memcg won't have a direct effect on other ones.
3271  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3272  *	whether or not to raise allocated through sk_under_memory_pressure() or
3273  *	its variants.
3274  */
3275 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3276 {
3277 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3278 	struct proto *prot = sk->sk_prot;
3279 	bool charged = true;
3280 	long allocated;
3281 
3282 	sk_memory_allocated_add(sk, amt);
3283 	allocated = sk_memory_allocated(sk);
3284 
3285 	if (memcg) {
3286 		charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
3287 		if (!charged)
3288 			goto suppress_allocation;
3289 	}
3290 
3291 	/* Under limit. */
3292 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3293 		sk_leave_memory_pressure(sk);
3294 		return 1;
3295 	}
3296 
3297 	/* Under pressure. */
3298 	if (allocated > sk_prot_mem_limits(sk, 1))
3299 		sk_enter_memory_pressure(sk);
3300 
3301 	/* Over hard limit. */
3302 	if (allocated > sk_prot_mem_limits(sk, 2))
3303 		goto suppress_allocation;
3304 
3305 	/* Guarantee minimum buffer size under pressure (either global
3306 	 * or memcg) to make sure features described in RFC 7323 (TCP
3307 	 * Extensions for High Performance) work properly.
3308 	 *
3309 	 * This rule does NOT hold when usage exceeds the global or memcg
3310 	 * hard limit, or else a DoS attack could take place by spawning
3311 	 * lots of sockets whose usage is under the minimum buffer size.
3312 	 */
3313 	if (kind == SK_MEM_RECV) {
3314 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3315 			return 1;
3316 
3317 	} else { /* SK_MEM_SEND */
3318 		int wmem0 = sk_get_wmem0(sk, prot);
3319 
3320 		if (sk->sk_type == SOCK_STREAM) {
3321 			if (sk->sk_wmem_queued < wmem0)
3322 				return 1;
3323 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3324 			return 1;
3325 		}
3326 	}
3327 
3328 	if (sk_has_memory_pressure(sk)) {
3329 		u64 alloc;
3330 
3331 		/* The following 'average' heuristic is within the
3332 		 * scope of global accounting, so it only makes
3333 		 * sense for global memory pressure.
3334 		 */
3335 		if (!sk_under_global_memory_pressure(sk))
3336 			return 1;
3337 
3338 		/* Try to be fair among all the sockets under global
3339 		 * pressure by allowing the ones with below-average
3340 		 * usage to raise their allocation.
3341 		 */
3342 		alloc = sk_sockets_allocated_read_positive(sk);
3343 		if (sk_prot_mem_limits(sk, 2) > alloc *
3344 		    sk_mem_pages(sk->sk_wmem_queued +
3345 				 atomic_read(&sk->sk_rmem_alloc) +
3346 				 sk->sk_forward_alloc))
3347 			return 1;
3348 	}
3349 
3350 suppress_allocation:
3351 
3352 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3353 		sk_stream_moderate_sndbuf(sk);
3354 
3355 		/* Fail only if socket is _under_ its sndbuf.
3356 		 * In this case we cannot block, so we have to fail.
3357 		 */
3358 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3359 			/* Force charge with __GFP_NOFAIL */
3360 			if (memcg && !charged) {
3361 				mem_cgroup_charge_skmem(memcg, amt,
3362 					gfp_memcg_charge() | __GFP_NOFAIL);
3363 			}
3364 			return 1;
3365 		}
3366 	}
3367 
3368 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3369 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3370 
3371 	sk_memory_allocated_sub(sk, amt);
3372 
3373 	if (memcg && charged)
3374 		mem_cgroup_uncharge_skmem(memcg, amt);
3375 
3376 	return 0;
3377 }
3378 
3379 /**
3380  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3381  *	@sk: socket
3382  *	@size: memory size to allocate
3383  *	@kind: allocation type
3384  *
3385  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3386  *	rmem allocation. This function assumes that protocols which have
3387  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3388  */
3389 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3390 {
3391 	int ret, amt = sk_mem_pages(size);
3392 
3393 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3394 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3395 	if (!ret)
3396 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3397 	return ret;
3398 }
3399 EXPORT_SYMBOL(__sk_mem_schedule);
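
/* A minimal usage sketch: protocols normally go through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers, which consume
 * sk_forward_alloc first and only fall back to __sk_mem_schedule() when it
 * is too small. "drop" is a hypothetical label in the caller.
 *
 *	if (!sk_wmem_schedule(sk, skb->truesize))
 *		goto drop;
 *	skb_set_owner_w(skb, sk);
 */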
3400 
3401 /**
3402  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3403  *	@sk: socket
3404  *	@amount: number of quanta
3405  *
3406  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3407  */
3408 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3409 {
3410 	sk_memory_allocated_sub(sk, amount);
3411 
3412 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3413 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3414 
3415 	if (sk_under_global_memory_pressure(sk) &&
3416 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3417 		sk_leave_memory_pressure(sk);
3418 }
3419 
3420 /**
3421  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3422  *	@sk: socket
3423  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3424  */
3425 void __sk_mem_reclaim(struct sock *sk, int amount)
3426 {
3427 	amount >>= PAGE_SHIFT;
3428 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3429 	__sk_mem_reduce_allocated(sk, amount);
3430 }
3431 EXPORT_SYMBOL(__sk_mem_reclaim);
3432 
3433 int sk_set_peek_off(struct sock *sk, int val)
3434 {
3435 	WRITE_ONCE(sk->sk_peek_off, val);
3436 	return 0;
3437 }
3438 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3439 
3440 /*
3441  * Set of default routines for initialising struct proto_ops when
3442  * the protocol does not support a particular function. In certain
3443  * cases where it makes no sense for a protocol to have a "do nothing"
3444  * function, some default processing is provided.
3445  */
3446 
3447 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3448 {
3449 	return -EOPNOTSUPP;
3450 }
3451 EXPORT_SYMBOL(sock_no_bind);
3452 
3453 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3454 		    int len, int flags)
3455 {
3456 	return -EOPNOTSUPP;
3457 }
3458 EXPORT_SYMBOL(sock_no_connect);
3459 
3460 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3461 {
3462 	return -EOPNOTSUPP;
3463 }
3464 EXPORT_SYMBOL(sock_no_socketpair);
3465 
3466 int sock_no_accept(struct socket *sock, struct socket *newsock,
3467 		   struct proto_accept_arg *arg)
3468 {
3469 	return -EOPNOTSUPP;
3470 }
3471 EXPORT_SYMBOL(sock_no_accept);
3472 
3473 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3474 		    int peer)
3475 {
3476 	return -EOPNOTSUPP;
3477 }
3478 EXPORT_SYMBOL(sock_no_getname);
3479 
3480 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3481 {
3482 	return -EOPNOTSUPP;
3483 }
3484 EXPORT_SYMBOL(sock_no_ioctl);
3485 
3486 int sock_no_listen(struct socket *sock, int backlog)
3487 {
3488 	return -EOPNOTSUPP;
3489 }
3490 EXPORT_SYMBOL(sock_no_listen);
3491 
3492 int sock_no_shutdown(struct socket *sock, int how)
3493 {
3494 	return -EOPNOTSUPP;
3495 }
3496 EXPORT_SYMBOL(sock_no_shutdown);
3497 
3498 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3499 {
3500 	return -EOPNOTSUPP;
3501 }
3502 EXPORT_SYMBOL(sock_no_sendmsg);
3503 
3504 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3505 {
3506 	return -EOPNOTSUPP;
3507 }
3508 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3509 
3510 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3511 		    int flags)
3512 {
3513 	return -EOPNOTSUPP;
3514 }
3515 EXPORT_SYMBOL(sock_no_recvmsg);
3516 
3517 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3518 {
3519 	/* Mirror missing mmap method error code */
3520 	return -ENODEV;
3521 }
3522 EXPORT_SYMBOL(sock_no_mmap);
3523 
3524 /*
3525  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3526  * various sock-based usage counts.
3527  */
3528 void __receive_sock(struct file *file)
3529 {
3530 	struct socket *sock;
3531 
3532 	sock = sock_from_file(file);
3533 	if (sock) {
3534 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3535 		sock_update_classid(&sock->sk->sk_cgrp_data);
3536 	}
3537 }
3538 
3539 /*
3540  *	Default Socket Callbacks
3541  */
3542 
3543 static void sock_def_wakeup(struct sock *sk)
3544 {
3545 	struct socket_wq *wq;
3546 
3547 	rcu_read_lock();
3548 	wq = rcu_dereference(sk->sk_wq);
3549 	if (skwq_has_sleeper(wq))
3550 		wake_up_interruptible_all(&wq->wait);
3551 	rcu_read_unlock();
3552 }
3553 
3554 static void sock_def_error_report(struct sock *sk)
3555 {
3556 	struct socket_wq *wq;
3557 
3558 	rcu_read_lock();
3559 	wq = rcu_dereference(sk->sk_wq);
3560 	if (skwq_has_sleeper(wq))
3561 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3562 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3563 	rcu_read_unlock();
3564 }
3565 
3566 void sock_def_readable(struct sock *sk)
3567 {
3568 	struct socket_wq *wq;
3569 
3570 	trace_sk_data_ready(sk);
3571 
3572 	rcu_read_lock();
3573 	wq = rcu_dereference(sk->sk_wq);
3574 	if (skwq_has_sleeper(wq))
3575 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3576 						EPOLLRDNORM | EPOLLRDBAND);
3577 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3578 	rcu_read_unlock();
3579 }
3580 
3581 static void sock_def_write_space(struct sock *sk)
3582 {
3583 	struct socket_wq *wq;
3584 
3585 	rcu_read_lock();
3586 
3587 	/* Do not wake up a writer until he can make "significant"
3588 	 * progress.  --DaveM
3589 	 */
3590 	if (sock_writeable(sk)) {
3591 		wq = rcu_dereference(sk->sk_wq);
3592 		if (skwq_has_sleeper(wq))
3593 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3594 						EPOLLWRNORM | EPOLLWRBAND);
3595 
3596 		/* Should agree with poll, otherwise some programs break */
3597 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3598 	}
3599 
3600 	rcu_read_unlock();
3601 }
3602 
3603 /* An optimised version of sock_def_write_space(); it should only be called
3604  * for SOCK_RCU_FREE sockets under an RCU read-side section and after putting
3605  * ->sk_wmem_alloc.
3606  */
3607 static void sock_def_write_space_wfree(struct sock *sk)
3608 {
3609 	/* Do not wake up a writer until he can make "significant"
3610 	 * progress.  --DaveM
3611 	 */
3612 	if (sock_writeable(sk)) {
3613 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3614 
3615 		/* rely on refcount_sub from sock_wfree() */
3616 		smp_mb__after_atomic();
3617 		if (wq && waitqueue_active(&wq->wait))
3618 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3619 						EPOLLWRNORM | EPOLLWRBAND);
3620 
3621 		/* Should agree with poll, otherwise some programs break */
3622 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3623 	}
3624 }
3625 
3626 static void sock_def_destruct(struct sock *sk)
3627 {
3628 }
3629 
3630 void sk_send_sigurg(struct sock *sk)
3631 {
3632 	if (sk->sk_socket && sk->sk_socket->file)
3633 		if (send_sigurg(sk->sk_socket->file))
3634 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3635 }
3636 EXPORT_SYMBOL(sk_send_sigurg);
3637 
3638 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3639 		    unsigned long expires)
3640 {
3641 	if (!mod_timer(timer, expires))
3642 		sock_hold(sk);
3643 }
3644 EXPORT_SYMBOL(sk_reset_timer);
3645 
3646 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3647 {
3648 	if (timer_delete(timer))
3649 		__sock_put(sk);
3650 }
3651 EXPORT_SYMBOL(sk_stop_timer);
3652 
3653 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3654 {
3655 	if (timer_delete_sync(timer))
3656 		__sock_put(sk);
3657 }
3658 EXPORT_SYMBOL(sk_stop_timer_sync);
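
/* A minimal usage sketch for the sk timer helpers: sk_reset_timer() takes a
 * reference on the socket when it arms a previously idle timer, the timer
 * callback is expected to drop it with sock_put() when done, and
 * sk_stop_timer() releases it if the timer was still pending when cancelled.
 * "delay" is a hypothetical number of jiffies.
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */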
3659 
3660 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3661 {
3662 	sk_init_common(sk);
3663 	sk->sk_send_head	=	NULL;
3664 
3665 	timer_setup(&sk->sk_timer, NULL, 0);
3666 
3667 	sk->sk_allocation	=	GFP_KERNEL;
3668 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3669 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3670 	sk->sk_state		=	TCP_CLOSE;
3671 	sk->sk_use_task_frag	=	true;
3672 	sk_set_socket(sk, sock);
3673 
3674 	sock_set_flag(sk, SOCK_ZAPPED);
3675 
3676 	if (sock) {
3677 		sk->sk_type	=	sock->type;
3678 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3679 		sock->sk	=	sk;
3680 	} else {
3681 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3682 	}
3683 	sk->sk_uid	=	uid;
3684 
3685 	sk->sk_state_change	=	sock_def_wakeup;
3686 	sk->sk_data_ready	=	sock_def_readable;
3687 	sk->sk_write_space	=	sock_def_write_space;
3688 	sk->sk_error_report	=	sock_def_error_report;
3689 	sk->sk_destruct		=	sock_def_destruct;
3690 
3691 	sk->sk_frag.page	=	NULL;
3692 	sk->sk_frag.offset	=	0;
3693 	sk->sk_peek_off		=	-1;
3694 
3695 	sk->sk_peer_pid 	=	NULL;
3696 	sk->sk_peer_cred	=	NULL;
3697 	spin_lock_init(&sk->sk_peer_lock);
3698 
3699 	sk->sk_write_pending	=	0;
3700 	sk->sk_rcvlowat		=	1;
3701 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3702 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3703 
3704 	sk->sk_stamp = SK_DEFAULT_STAMP;
3705 #if BITS_PER_LONG==32
3706 	seqlock_init(&sk->sk_stamp_seq);
3707 #endif
3708 	atomic_set(&sk->sk_zckey, 0);
3709 
3710 #ifdef CONFIG_NET_RX_BUSY_POLL
3711 	sk->sk_napi_id		=	0;
3712 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3713 #endif
3714 
3715 	sk->sk_max_pacing_rate = ~0UL;
3716 	sk->sk_pacing_rate = ~0UL;
3717 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3718 	sk->sk_incoming_cpu = -1;
3719 
3720 	sk_rx_queue_clear(sk);
3721 	/*
3722 	 * Before updating sk_refcnt, we must commit prior changes to memory
3723 	 * (Documentation/RCU/rculist_nulls.rst for details)
3724 	 */
3725 	smp_wmb();
3726 	refcount_set(&sk->sk_refcnt, 1);
3727 	atomic_set(&sk->sk_drops, 0);
3728 }
3729 EXPORT_SYMBOL(sock_init_data_uid);
3730 
3731 void sock_init_data(struct socket *sock, struct sock *sk)
3732 {
3733 	kuid_t uid = sock ?
3734 		SOCK_INODE(sock)->i_uid :
3735 		make_kuid(sock_net(sk)->user_ns, 0);
3736 
3737 	sock_init_data_uid(sock, sk, uid);
3738 }
3739 EXPORT_SYMBOL(sock_init_data);
3740 
3741 void lock_sock_nested(struct sock *sk, int subclass)
3742 {
3743 	/* The sk_lock has mutex_lock() semantics here. */
3744 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3745 
3746 	might_sleep();
3747 	spin_lock_bh(&sk->sk_lock.slock);
3748 	if (sock_owned_by_user_nocheck(sk))
3749 		__lock_sock(sk);
3750 	sk->sk_lock.owned = 1;
3751 	spin_unlock_bh(&sk->sk_lock.slock);
3752 }
3753 EXPORT_SYMBOL(lock_sock_nested);
3754 
3755 void release_sock(struct sock *sk)
3756 {
3757 	spin_lock_bh(&sk->sk_lock.slock);
3758 	if (sk->sk_backlog.tail)
3759 		__release_sock(sk);
3760 
3761 	if (sk->sk_prot->release_cb)
3762 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3763 				     tcp_release_cb, sk);
3764 
3765 	sock_release_ownership(sk);
3766 	if (waitqueue_active(&sk->sk_lock.wq))
3767 		wake_up(&sk->sk_lock.wq);
3768 	spin_unlock_bh(&sk->sk_lock.slock);
3769 }
3770 EXPORT_SYMBOL(release_sock);
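
/* A minimal usage sketch for the socket lock: process context brackets
 * state changes with lock_sock()/release_sock(), so packets arriving from
 * softirq context in the meantime are queued to the backlog and replayed
 * by __release_sock() above.
 *
 *	lock_sock(sk);
 *	... update sk state ...
 *	release_sock(sk);
 */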
3771 
3772 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3773 {
3774 	might_sleep();
3775 	spin_lock_bh(&sk->sk_lock.slock);
3776 
3777 	if (!sock_owned_by_user_nocheck(sk)) {
3778 		/*
3779 		 * Fast path return with bottom halves disabled and
3780 		 * sock::sk_lock.slock held.
3781 		 *
3782 		 * The 'mutex' is not contended and holding
3783 		 * sock::sk_lock.slock prevents all other lockers to
3784 		 * proceed so the corresponding unlock_sock_fast() can
3785 		 * avoid the slow path of release_sock() completely and
3786 		 * just release slock.
3787 		 *
3788 		 * From a semantical POV this is equivalent to 'acquiring'
3789 		 * the 'mutex', hence the corresponding lockdep
3790 		 * mutex_release() has to happen in the fast path of
3791 		 * unlock_sock_fast().
3792 		 */
3793 		return false;
3794 	}
3795 
3796 	__lock_sock(sk);
3797 	sk->sk_lock.owned = 1;
3798 	__acquire(&sk->sk_lock.slock);
3799 	spin_unlock_bh(&sk->sk_lock.slock);
3800 	return true;
3801 }
3802 EXPORT_SYMBOL(__lock_sock_fast);
3803 
3804 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3805 		   bool timeval, bool time32)
3806 {
3807 	struct sock *sk = sock->sk;
3808 	struct timespec64 ts;
3809 
3810 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3811 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3812 	if (ts.tv_sec == -1)
3813 		return -ENOENT;
3814 	if (ts.tv_sec == 0) {
3815 		ktime_t kt = ktime_get_real();
3816 		sock_write_timestamp(sk, kt);
3817 		ts = ktime_to_timespec64(kt);
3818 	}
3819 
3820 	if (timeval)
3821 		ts.tv_nsec /= 1000;
3822 
3823 #ifdef CONFIG_COMPAT_32BIT_TIME
3824 	if (time32)
3825 		return put_old_timespec32(&ts, userstamp);
3826 #endif
3827 #ifdef CONFIG_SPARC64
3828 	/* beware of padding in sparc64 timeval */
3829 	if (timeval && !in_compat_syscall()) {
3830 		struct __kernel_old_timeval __user tv = {
3831 			.tv_sec = ts.tv_sec,
3832 			.tv_usec = ts.tv_nsec,
3833 		};
3834 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3835 			return -EFAULT;
3836 		return 0;
3837 	}
3838 #endif
3839 	return put_timespec64(&ts, userstamp);
3840 }
3841 EXPORT_SYMBOL(sock_gettstamp);
3842 
3843 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3844 {
3845 	if (!sock_flag(sk, flag)) {
3846 		unsigned long previous_flags = sk->sk_flags;
3847 
3848 		sock_set_flag(sk, flag);
3849 		/*
3850 		 * we just set one of the two flags which require net
3851 		 * time stamping, but time stamping might have been on
3852 		 * already because of the other one
3853 		 */
3854 		if (sock_needs_netstamp(sk) &&
3855 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3856 			net_enable_timestamp();
3857 	}
3858 }
3859 
3860 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3861 		       int level, int type)
3862 {
3863 	struct sock_exterr_skb *serr;
3864 	struct sk_buff *skb;
3865 	int copied, err;
3866 
3867 	err = -EAGAIN;
3868 	skb = sock_dequeue_err_skb(sk);
3869 	if (skb == NULL)
3870 		goto out;
3871 
3872 	copied = skb->len;
3873 	if (copied > len) {
3874 		msg->msg_flags |= MSG_TRUNC;
3875 		copied = len;
3876 	}
3877 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3878 	if (err)
3879 		goto out_free_skb;
3880 
3881 	sock_recv_timestamp(msg, sk, skb);
3882 
3883 	serr = SKB_EXT_ERR(skb);
3884 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3885 
3886 	msg->msg_flags |= MSG_ERRQUEUE;
3887 	err = copied;
3888 
3889 out_free_skb:
3890 	kfree_skb(skb);
3891 out:
3892 	return err;
3893 }
3894 EXPORT_SYMBOL(sock_recv_errqueue);
3895 
3896 /*
3897  *	Get a socket option on a socket.
3898  *
3899  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3900  *	asynchronous errors should be reported by getsockopt. We assume
3901  *	this means if you specify SO_ERROR (otherwise what is the point of it).
3902  */
3903 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3904 			   char __user *optval, int __user *optlen)
3905 {
3906 	struct sock *sk = sock->sk;
3907 
3908 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3909 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3910 }
3911 EXPORT_SYMBOL(sock_common_getsockopt);
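/*
 * Editorial note: the SO_ERROR convention mentioned above is what
 * userspace relies on after, e.g., a non-blocking connect() reports
 * writability:
 *
 *	int err = 0;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		errno = err;	// the connect failed asynchronously
 */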
3912 
3913 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3914 			int flags)
3915 {
3916 	struct sock *sk = sock->sk;
3917 	int addr_len = 0;
3918 	int err;
3919 
3920 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3921 	if (err >= 0)
3922 		msg->msg_namelen = addr_len;
3923 	return err;
3924 }
3925 EXPORT_SYMBOL(sock_common_recvmsg);
3926 
3927 /*
3928  *	Set socket options on an inet socket.
3929  */
3930 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3931 			   sockptr_t optval, unsigned int optlen)
3932 {
3933 	struct sock *sk = sock->sk;
3934 
3935 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3936 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3937 }
3938 EXPORT_SYMBOL(sock_common_setsockopt);
3939 
3940 void sk_common_release(struct sock *sk)
3941 {
3942 	if (sk->sk_prot->destroy)
3943 		sk->sk_prot->destroy(sk);
3944 
3945 	/*
3946 	 * Observation: when sk_common_release is called, processes have
3947 	 * no access to the socket, but the network stack still does.
3948 	 * Step one, detach it from networking:
3949 	 *
3950 	 * A. Remove it from the hash tables.
3951 	 */
3952 
3953 	sk->sk_prot->unhash(sk);
3954 
3955 	/*
3956 	 * At this point the socket cannot receive new packets, but some may
3957 	 * still be in flight because a CPU running the receive path did its
3958 	 * hash table lookup before we unhashed the socket. They will reach
3959 	 * the receive queue and be purged by the socket destructor.
3960 	 *
3961 	 * We also still have packets pending on the receive queue and, probably,
3962 	 * our own packets waiting in device queues. sock_destroy will drain the
3963 	 * receive queue, but transmitted packets will delay socket destruction
3964 	 * until the last reference is released.
3965 	 */
3966 
3967 	sock_orphan(sk);
3968 
3969 	xfrm_sk_free_policy(sk);
3970 
3971 	sock_put(sk);
3972 }
3973 EXPORT_SYMBOL(sk_common_release);
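/*
 * Editorial sketch (hypothetical protocol, not from this file): a simple
 * datagram-style protocol typically hands its socket to the teardown
 * sequence above straight from its ->close() handler:
 *
 *	static void my_proto_close(struct sock *sk, long timeout)
 *	{
 *		// protocol-private cleanup first (timers, private queues, ...)
 *		sk_common_release(sk);
 *	}
 *
 * sk_common_release() then runs ->destroy(), unhashes and orphans the
 * socket, and drops a reference via sock_put().
 */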
3974 
3975 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3976 {
3977 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3978 
3979 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3980 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3981 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3982 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3983 	mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
3984 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3985 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3986 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3987 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3988 }
3989 
3990 #ifdef CONFIG_PROC_FS
3991 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3992 
3993 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3994 {
3995 	int cpu, idx = prot->inuse_idx;
3996 	int res = 0;
3997 
3998 	for_each_possible_cpu(cpu)
3999 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4000 
4001 	return res >= 0 ? res : 0;
4002 }
4003 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
4004 
4005 int sock_inuse_get(struct net *net)
4006 {
4007 	int cpu, res = 0;
4008 
4009 	for_each_possible_cpu(cpu)
4010 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4011 
4012 	return res;
4013 }
4015 EXPORT_SYMBOL_GPL(sock_inuse_get);
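/*
 * Editorial sketch: the counters summed above are bumped from the
 * protocol side with sock_prot_inuse_add() from include/net/sock.h,
 * typically when a socket is hashed/unhashed:
 *
 *	sock_prot_inuse_add(net, sk->sk_prot, 1);	// socket goes live
 *	...
 *	sock_prot_inuse_add(net, sk->sk_prot, -1);	// socket torn down
 *
 * Each call only touches the local CPU's struct prot_inuse slot, which is
 * why the readers here have to walk all possible CPUs.
 */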
4016 
4017 static int __net_init sock_inuse_init_net(struct net *net)
4018 {
4019 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4020 	if (net->core.prot_inuse == NULL)
4021 		return -ENOMEM;
4022 	return 0;
4023 }
4024 
4025 static void __net_exit sock_inuse_exit_net(struct net *net)
4026 {
4027 	free_percpu(net->core.prot_inuse);
4028 }
4029 
4030 static struct pernet_operations net_inuse_ops = {
4031 	.init = sock_inuse_init_net,
4032 	.exit = sock_inuse_exit_net,
4033 };
4034 
4035 static __init int net_inuse_init(void)
4036 {
4037 	if (register_pernet_subsys(&net_inuse_ops))
4038 		panic("Cannot initialize net inuse counters");
4039 
4040 	return 0;
4041 }
4042 
4043 core_initcall(net_inuse_init);
4044 
4045 static int assign_proto_idx(struct proto *prot)
4046 {
4047 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4048 
4049 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4050 		pr_err("PROTO_INUSE_NR exhausted\n");
4051 		return -ENOSPC;
4052 	}
4053 
4054 	set_bit(prot->inuse_idx, proto_inuse_idx);
4055 	return 0;
4056 }
4057 
4058 static void release_proto_idx(struct proto *prot)
4059 {
4060 	if (prot->inuse_idx != PROTO_INUSE_NR)
4061 		clear_bit(prot->inuse_idx, proto_inuse_idx);
4062 }
4063 #else
4064 static inline int assign_proto_idx(struct proto *prot)
4065 {
4066 	return 0;
4067 }
4068 
4069 static inline void release_proto_idx(struct proto *prot)
4070 {
4071 }
4072 
4073 #endif
4074 
4075 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4076 {
4077 	if (!twsk_prot)
4078 		return;
4079 	kfree(twsk_prot->twsk_slab_name);
4080 	twsk_prot->twsk_slab_name = NULL;
4081 	kmem_cache_destroy(twsk_prot->twsk_slab);
4082 	twsk_prot->twsk_slab = NULL;
4083 }
4084 
4085 static int tw_prot_init(const struct proto *prot)
4086 {
4087 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4088 
4089 	if (!twsk_prot)
4090 		return 0;
4091 
4092 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4093 					      prot->name);
4094 	if (!twsk_prot->twsk_slab_name)
4095 		return -ENOMEM;
4096 
4097 	twsk_prot->twsk_slab =
4098 		kmem_cache_create(twsk_prot->twsk_slab_name,
4099 				  twsk_prot->twsk_obj_size, 0,
4100 				  SLAB_ACCOUNT | prot->slab_flags,
4101 				  NULL);
4102 	if (!twsk_prot->twsk_slab) {
4103 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4104 			prot->name);
4105 		return -ENOMEM;
4106 	}
4107 
4108 	return 0;
4109 }
4110 
4111 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4112 {
4113 	if (!rsk_prot)
4114 		return;
4115 	kfree(rsk_prot->slab_name);
4116 	rsk_prot->slab_name = NULL;
4117 	kmem_cache_destroy(rsk_prot->slab);
4118 	rsk_prot->slab = NULL;
4119 }
4120 
4121 static int req_prot_init(const struct proto *prot)
4122 {
4123 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4124 
4125 	if (!rsk_prot)
4126 		return 0;
4127 
4128 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4129 					prot->name);
4130 	if (!rsk_prot->slab_name)
4131 		return -ENOMEM;
4132 
4133 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4134 					   rsk_prot->obj_size, 0,
4135 					   SLAB_ACCOUNT | prot->slab_flags,
4136 					   NULL);
4137 
4138 	if (!rsk_prot->slab) {
4139 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4140 			prot->name);
4141 		return -ENOMEM;
4142 	}
4143 	return 0;
4144 }
4145 
4146 int proto_register(struct proto *prot, int alloc_slab)
4147 {
4148 	int ret = -ENOBUFS;
4149 
4150 	if (prot->memory_allocated && !prot->sysctl_mem) {
4151 		pr_err("%s: missing sysctl_mem\n", prot->name);
4152 		return -EINVAL;
4153 	}
4154 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4155 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4156 		return -EINVAL;
4157 	}
4158 	if (alloc_slab) {
4159 		prot->slab = kmem_cache_create_usercopy(prot->name,
4160 					prot->obj_size, 0,
4161 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4162 					prot->slab_flags,
4163 					prot->useroffset, prot->usersize,
4164 					NULL);
4165 
4166 		if (prot->slab == NULL) {
4167 			pr_crit("%s: Can't create sock SLAB cache!\n",
4168 				prot->name);
4169 			goto out;
4170 		}
4171 
4172 		if (req_prot_init(prot))
4173 			goto out_free_request_sock_slab;
4174 
4175 		if (tw_prot_init(prot))
4176 			goto out_free_timewait_sock_slab;
4177 	}
4178 
4179 	mutex_lock(&proto_list_mutex);
4180 	ret = assign_proto_idx(prot);
4181 	if (ret) {
4182 		mutex_unlock(&proto_list_mutex);
4183 		goto out_free_timewait_sock_slab;
4184 	}
4185 	list_add(&prot->node, &proto_list);
4186 	mutex_unlock(&proto_list_mutex);
4187 	return ret;
4188 
4189 out_free_timewait_sock_slab:
4190 	if (alloc_slab)
4191 		tw_prot_cleanup(prot->twsk_prot);
4192 out_free_request_sock_slab:
4193 	if (alloc_slab) {
4194 		req_prot_cleanup(prot->rsk_prot);
4195 
4196 		kmem_cache_destroy(prot->slab);
4197 		prot->slab = NULL;
4198 	}
4199 out:
4200 	return ret;
4201 }
4202 EXPORT_SYMBOL(proto_register);
4203 
4204 void proto_unregister(struct proto *prot)
4205 {
4206 	mutex_lock(&proto_list_mutex);
4207 	release_proto_idx(prot);
4208 	list_del(&prot->node);
4209 	mutex_unlock(&proto_list_mutex);
4210 
4211 	kmem_cache_destroy(prot->slab);
4212 	prot->slab = NULL;
4213 
4214 	req_prot_cleanup(prot->rsk_prot);
4215 	tw_prot_cleanup(prot->twsk_prot);
4216 }
4217 EXPORT_SYMBOL(proto_unregister);
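/*
 * Editorial sketch (hypothetical protocol, not from this file): the
 * registration pattern served by proto_register()/proto_unregister().
 * With alloc_slab != 0 the sock slab (plus optional request/timewait
 * slabs) is created from the sizes declared in struct proto:
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_proto_sock),
 *		// .close/.recvmsg/... as needed
 *	};
 *
 *	static int __init my_proto_init(void)
 *	{
 *		return proto_register(&my_proto, 1);
 *	}
 *
 *	static void __exit my_proto_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */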
4218 
4219 int sock_load_diag_module(int family, int protocol)
4220 {
4221 	if (!protocol) {
4222 		if (!sock_is_registered(family))
4223 			return -ENOENT;
4224 
4225 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4226 				      NETLINK_SOCK_DIAG, family);
4227 	}
4228 
4229 #ifdef CONFIG_INET
4230 	if (family == AF_INET &&
4231 	    protocol != IPPROTO_RAW &&
4232 	    protocol < MAX_INET_PROTOS &&
4233 	    !rcu_access_pointer(inet_protos[protocol]))
4234 		return -ENOENT;
4235 #endif
4236 
4237 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4238 			      NETLINK_SOCK_DIAG, family, protocol);
4239 }
4240 EXPORT_SYMBOL(sock_load_diag_module);
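/*
 * Editorial worked example: for an AF_INET/IPPROTO_TCP socket the second
 * request_module() above expands to the alias
 * "net-pf-16-proto-4-type-2-6" (PF_NETLINK = 16, NETLINK_SOCK_DIAG = 4,
 * AF_INET = 2, IPPROTO_TCP = 6), which the TCP diag module advertises via
 * its module alias so it can be autoloaded on demand.
 */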
4241 
4242 #ifdef CONFIG_PROC_FS
4243 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4244 	__acquires(proto_list_mutex)
4245 {
4246 	mutex_lock(&proto_list_mutex);
4247 	return seq_list_start_head(&proto_list, *pos);
4248 }
4249 
4250 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4251 {
4252 	return seq_list_next(v, &proto_list, pos);
4253 }
4254 
4255 static void proto_seq_stop(struct seq_file *seq, void *v)
4256 	__releases(proto_list_mutex)
4257 {
4258 	mutex_unlock(&proto_list_mutex);
4259 }
4260 
4261 static char proto_method_implemented(const void *method)
4262 {
4263 	return method == NULL ? 'n' : 'y';
4264 }
4265 static long sock_prot_memory_allocated(struct proto *proto)
4266 {
4267 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4268 }
4269 
4270 static const char *sock_prot_memory_pressure(struct proto *proto)
4271 {
4272 	return proto->memory_pressure != NULL ?
4273 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4274 }
4275 
4276 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4277 {
4279 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4280 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4281 		   proto->name,
4282 		   proto->obj_size,
4283 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4284 		   sock_prot_memory_allocated(proto),
4285 		   sock_prot_memory_pressure(proto),
4286 		   proto->max_header,
4287 		   proto->slab == NULL ? "no" : "yes",
4288 		   module_name(proto->owner),
4289 		   proto_method_implemented(proto->close),
4290 		   proto_method_implemented(proto->connect),
4291 		   proto_method_implemented(proto->disconnect),
4292 		   proto_method_implemented(proto->accept),
4293 		   proto_method_implemented(proto->ioctl),
4294 		   proto_method_implemented(proto->init),
4295 		   proto_method_implemented(proto->destroy),
4296 		   proto_method_implemented(proto->shutdown),
4297 		   proto_method_implemented(proto->setsockopt),
4298 		   proto_method_implemented(proto->getsockopt),
4299 		   proto_method_implemented(proto->sendmsg),
4300 		   proto_method_implemented(proto->recvmsg),
4301 		   proto_method_implemented(proto->bind),
4302 		   proto_method_implemented(proto->backlog_rcv),
4303 		   proto_method_implemented(proto->hash),
4304 		   proto_method_implemented(proto->unhash),
4305 		   proto_method_implemented(proto->get_port),
4306 		   proto_method_implemented(proto->enter_memory_pressure));
4307 }
4308 
4309 static int proto_seq_show(struct seq_file *seq, void *v)
4310 {
4311 	if (v == &proto_list)
4312 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4313 			   "protocol",
4314 			   "size",
4315 			   "sockets",
4316 			   "memory",
4317 			   "press",
4318 			   "maxhdr",
4319 			   "slab",
4320 			   "module",
4321 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4322 	else
4323 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4324 	return 0;
4325 }
4326 
4327 static const struct seq_operations proto_seq_ops = {
4328 	.start  = proto_seq_start,
4329 	.next   = proto_seq_next,
4330 	.stop   = proto_seq_stop,
4331 	.show   = proto_seq_show,
4332 };
4333 
4334 static __net_init int proto_init_net(struct net *net)
4335 {
4336 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4337 			sizeof(struct seq_net_private)))
4338 		return -ENOMEM;
4339 
4340 	return 0;
4341 }
4342 
4343 static __net_exit void proto_exit_net(struct net *net)
4344 {
4345 	remove_proc_entry("protocols", net->proc_net);
4346 }
4347 
4349 static __net_initdata struct pernet_operations proto_net_ops = {
4350 	.init = proto_init_net,
4351 	.exit = proto_exit_net,
4352 };
4353 
4354 static int __init proto_init(void)
4355 {
4356 	return register_pernet_subsys(&proto_net_ops);
4357 }
4358 
4359 subsys_initcall(proto_init);
4360 
4361 #endif /* PROC_FS */
4362 
4363 #ifdef CONFIG_NET_RX_BUSY_POLL
4364 bool sk_busy_loop_end(void *p, unsigned long start_time)
4365 {
4366 	struct sock *sk = p;
4367 
4368 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4369 		return true;
4370 
4371 	if (sk_is_udp(sk) &&
4372 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4373 		return true;
4374 
4375 	return sk_busy_loop_timeout(sk, start_time);
4376 }
4377 EXPORT_SYMBOL(sk_busy_loop_end);
4378 #endif /* CONFIG_NET_RX_BUSY_POLL */
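/*
 * Editorial sketch: sk_busy_loop_end() is the termination check for the
 * napi busy-poll loop; the timeout it tests is the per-socket budget that
 * userspace can set with SO_BUSY_POLL, e.g.:
 *
 *	unsigned int usecs = 50;	// illustrative value
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usecs, sizeof(usecs));
 *
 * With that set, a read on an empty queue may spin in the driver for up
 * to ~50us, re-evaluating the conditions above, before sleeping.
 */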
4379 
4380 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4381 {
4382 	if (!sk->sk_prot->bind_add)
4383 		return -EOPNOTSUPP;
4384 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4385 }
4386 EXPORT_SYMBOL(sock_bind_add);
4387 
4388 /* Copy 'size' bytes in from userspace, run the protocol ioctl and copy 'size' bytes back out to userspace */
4389 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4390 		     void __user *arg, void *karg, size_t size)
4391 {
4392 	int ret;
4393 
4394 	if (copy_from_user(karg, arg, size))
4395 		return -EFAULT;
4396 
4397 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4398 	if (ret)
4399 		return ret;
4400 
4401 	if (copy_to_user(arg, karg, size))
4402 		return -EFAULT;
4403 
4404 	return 0;
4405 }
4406 EXPORT_SYMBOL(sock_ioctl_inout);
4407 
4408 /* This is the most common ioctl prep function, where the result (4 bytes) is
4409  * copied back to userspace if the ioctl() returns successfully. No input
4410  * argument is copied from userspace.
4411  */
4412 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4413 {
4414 	int ret, karg = 0;
4415 
4416 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4417 	if (ret)
4418 		return ret;
4419 
4420 	return put_user(karg, (int __user *)arg);
4421 }
4422 
4423 /* A wrapper around sock ioctls, which copies the data from userspace
4424  * (depending on the protocol/ioctl), and copies back the result to userspace.
4425  * The main motivation for this function is to pass kernel memory to the
4426  * protocol ioctl callbacks, instead of userspace memory.
4427  */
4428 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4429 {
4430 	int rc = 1;
4431 
4432 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4433 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4434 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4435 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4436 	else if (sk_is_phonet(sk))
4437 		rc = phonet_sk_ioctl(sk, cmd, arg);
4438 
4439 	/* If the ioctl was processed, return its value */
4440 	if (rc <= 0)
4441 		return rc;
4442 
4443 	/* Otherwise call the default handler */
4444 	return sock_ioctl_out(sk, cmd, arg);
4445 }
4446 EXPORT_SYMBOL(sk_ioctl);
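/*
 * Editorial sketch (hypothetical protocol and helper): what the
 * kernel-memory contract above means for a ->ioctl() implementation.
 * The handler receives 'karg' pointing at kernel memory already filled
 * by sock_ioctl_inout() (or zeroed by sock_ioctl_out()) and never
 * dereferences __user pointers itself:
 *
 *	static int my_proto_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = my_proto_rcv_queue_len(sk);	// hypothetical
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 *
 * Returning 0 lets the wrappers above copy *karg back to userspace.
 */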
4447 
4448 static int __init sock_struct_check(void)
4449 {
4450 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4451 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4452 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4453 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4454 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4455 
4456 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4457 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4458 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4459 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4460 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4461 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4462 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4463 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4464 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4465 
4466 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4467 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4468 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4469 
4470 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4471 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4472 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4473 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4474 
4475 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4477 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4478 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4479 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4480 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4481 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4482 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4483 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4484 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4485 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4486 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4487 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4488 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4489 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4490 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4491 
4492 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4493 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4494 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4495 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4496 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4497 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4498 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4499 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4500 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4501 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4502 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4503 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4504 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4505 	return 0;
4506 }
4507 
4508 core_initcall(sock_struct_check);
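/*
 * Editorial sketch: the groups asserted above are delimited in struct sock
 * (include/net/sock.h) with the cacheline group markers from
 * <linux/cache.h>, roughly:
 *
 *	struct sock {
 *		...
 *		__cacheline_group_begin(sock_write_rx);
 *		atomic_t		sk_drops;
 *		...
 *		__cacheline_group_end(sock_write_rx);
 *		...
 *	};
 *
 * CACHELINE_ASSERT_GROUP_MEMBER() then breaks the build if a later change
 * moves a field out of its intended group.
 */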
4509