xref: /linux/net/core/sock.c (revision cff9c565e65f3622e8dc1dcc21c1520a083dff35)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 #include <linux/mroute.h>
118 #include <linux/mroute6.h>
119 #include <linux/icmpv6.h>
120 
121 #include <linux/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 #include <net/bpf_sk_storage.h>
139 
140 #include <trace/events/sock.h>
141 
142 #include <net/tcp.h>
143 #include <net/busy_poll.h>
144 #include <net/phonet/phonet.h>
145 
146 #include <linux/ethtool.h>
147 
148 #include "dev.h"
149 
150 static DEFINE_MUTEX(proto_list_mutex);
151 static LIST_HEAD(proto_list);
152 
153 static void sock_def_write_space_wfree(struct sock *sk);
154 static void sock_def_write_space(struct sock *sk);
155 
156 /**
157  * sk_ns_capable - General socket capability test
158  * @sk: Socket to use a capability on or through
159  * @user_ns: The user namespace of the capability to use
160  * @cap: The capability to use
161  *
162  * Test to see if the opener of the socket had the capability @cap when
163  * the socket was created and if the current process has that capability
164  * in the user namespace @user_ns.
165  */
166 bool sk_ns_capable(const struct sock *sk,
167 		   struct user_namespace *user_ns, int cap)
168 {
169 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
170 		ns_capable(user_ns, cap);
171 }
172 EXPORT_SYMBOL(sk_ns_capable);
173 
174 /**
175  * sk_capable - Socket global capability test
176  * @sk: Socket to use a capability on or through
177  * @cap: The global capability to use
178  *
179  * Test to see if the opener of the socket had the capability @cap when
180  * the socket was created and if the current process has that capability
181  * in all user namespaces.
182  */
183 bool sk_capable(const struct sock *sk, int cap)
184 {
185 	return sk_ns_capable(sk, &init_user_ns, cap);
186 }
187 EXPORT_SYMBOL(sk_capable);
188 
189 /**
190  * sk_net_capable - Network namespace socket capability test
191  * @sk: Socket to use a capability on or through
192  * @cap: The capability to use
193  *
194  * Test to see if the opener of the socket had the capability @cap when the
195  * socket was created and if the current process has that capability over
196  * the network namespace the socket is a member of.
197  */
198 bool sk_net_capable(const struct sock *sk, int cap)
199 {
200 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
201 }
202 EXPORT_SYMBOL(sk_net_capable);
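/* Illustrative example (editorial sketch, not part of the original file):
 * a protocol's privileged configuration path would typically gate on one
 * of the helpers above, e.g.:
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which requires both that the opener held CAP_NET_ADMIN when the socket
 * was created and that the current task has it over the socket's network
 * namespace.
 */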
203 
204 /*
205  * Each address family might have different locking rules, so we have
206  * one slock key per address family and separate keys for internal and
207  * userspace sockets.
208  */
209 static struct lock_class_key af_family_keys[AF_MAX];
210 static struct lock_class_key af_family_kern_keys[AF_MAX];
211 static struct lock_class_key af_family_slock_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
213 
214 /*
215  * Make lock validator output more readable. (We pre-construct these
216  * strings at build time, so that runtime initialization of socket
217  * locks is fast):
218  */
219 
220 #define _sock_locks(x)						  \
221   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
222   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
223   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
224   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
225   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
226   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
227   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
228   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
229   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
230   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
231   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
232   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
233   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
234   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
235   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
236   x "AF_MCTP"  , \
237   x "AF_MAX"
238 
239 static const char *const af_family_key_strings[AF_MAX+1] = {
240 	_sock_locks("sk_lock-")
241 };
242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("slock-")
244 };
245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("clock-")
247 };
248 
249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-sk_lock-")
251 };
252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-slock-")
254 };
255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-clock-")
257 };
258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
259 	_sock_locks("rlock-")
260 };
261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("wlock-")
263 };
264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
265 	_sock_locks("elock-")
266 };
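/* With the prefixes above, lockdep reports an AF_INET socket lock as
 * "sk_lock-AF_INET" and its spinlock as "slock-AF_INET"; kernel-internal
 * sockets get the "k-" variants, and the receive/write/error queue locks
 * use the "rlock-"/"wlock-"/"elock-" prefixes. (Editorial note, not part
 * of the original file.)
 */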
267 
268 /*
269  * sk_callback_lock and sk queues locking rules are per-address-family,
270  * so split the lock classes by using a per-AF key:
271  */
272 static struct lock_class_key af_callback_keys[AF_MAX];
273 static struct lock_class_key af_rlock_keys[AF_MAX];
274 static struct lock_class_key af_wlock_keys[AF_MAX];
275 static struct lock_class_key af_elock_keys[AF_MAX];
276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
277 
278 /* Run time adjustable parameters. */
279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
280 EXPORT_SYMBOL(sysctl_wmem_max);
281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
282 EXPORT_SYMBOL(sysctl_rmem_max);
283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
285 
286 int sysctl_tstamp_allow_data __read_mostly = 1;
287 
288 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
289 EXPORT_SYMBOL_GPL(memalloc_socks_key);
290 
291 /**
292  * sk_set_memalloc - sets %SOCK_MEMALLOC
293  * @sk: socket to set it on
294  *
295  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
296  * It is the administrator's responsibility to adjust min_free_kbytes
297  * to meet the requirements.
298  */
299 void sk_set_memalloc(struct sock *sk)
300 {
301 	sock_set_flag(sk, SOCK_MEMALLOC);
302 	sk->sk_allocation |= __GFP_MEMALLOC;
303 	static_branch_inc(&memalloc_socks_key);
304 }
305 EXPORT_SYMBOL_GPL(sk_set_memalloc);
306 
307 void sk_clear_memalloc(struct sock *sk)
308 {
309 	sock_reset_flag(sk, SOCK_MEMALLOC);
310 	sk->sk_allocation &= ~__GFP_MEMALLOC;
311 	static_branch_dec(&memalloc_socks_key);
312 
313 	/*
314 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
315 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
316 	 * it has rmem allocations due to the last swapfile being deactivated
317 	 * but there is a risk that the socket is unusable due to exceeding
318 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
319 	 */
320 	sk_mem_reclaim(sk);
321 }
322 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
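/* Illustrative sketch (assumption, not from this file): a subsystem that
 * swaps over the network, such as a block or NFS transport backing a
 * swapfile, marks its socket so allocations may dip into the emergency
 * reserves, and clears the flag when the swapfile goes away:
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 */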
323 
324 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
325 {
326 	int ret;
327 	unsigned int noreclaim_flag;
328 
329 	/* these should have been dropped before queueing */
330 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
331 
332 	noreclaim_flag = memalloc_noreclaim_save();
333 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
334 				 tcp_v6_do_rcv,
335 				 tcp_v4_do_rcv,
336 				 sk, skb);
337 	memalloc_noreclaim_restore(noreclaim_flag);
338 
339 	return ret;
340 }
341 EXPORT_SYMBOL(__sk_backlog_rcv);
342 
343 void sk_error_report(struct sock *sk)
344 {
345 	sk->sk_error_report(sk);
346 
347 	switch (sk->sk_family) {
348 	case AF_INET:
349 		fallthrough;
350 	case AF_INET6:
351 		trace_inet_sk_error_report(sk);
352 		break;
353 	default:
354 		break;
355 	}
356 }
357 EXPORT_SYMBOL(sk_error_report);
358 
359 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
360 {
361 	struct __kernel_sock_timeval tv;
362 
363 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
364 		tv.tv_sec = 0;
365 		tv.tv_usec = 0;
366 	} else {
367 		tv.tv_sec = timeo / HZ;
368 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
369 	}
370 
371 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
372 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
373 		*(struct old_timeval32 *)optval = tv32;
374 		return sizeof(tv32);
375 	}
376 
377 	if (old_timeval) {
378 		struct __kernel_old_timeval old_tv;
379 		old_tv.tv_sec = tv.tv_sec;
380 		old_tv.tv_usec = tv.tv_usec;
381 		*(struct __kernel_old_timeval *)optval = old_tv;
382 		return sizeof(old_tv);
383 	}
384 
385 	*(struct __kernel_sock_timeval *)optval = tv;
386 	return sizeof(tv);
387 }
388 EXPORT_SYMBOL(sock_get_timeout);
389 
390 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
391 			   sockptr_t optval, int optlen, bool old_timeval)
392 {
393 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
394 		struct old_timeval32 tv32;
395 
396 		if (optlen < sizeof(tv32))
397 			return -EINVAL;
398 
399 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
400 			return -EFAULT;
401 		tv->tv_sec = tv32.tv_sec;
402 		tv->tv_usec = tv32.tv_usec;
403 	} else if (old_timeval) {
404 		struct __kernel_old_timeval old_tv;
405 
406 		if (optlen < sizeof(old_tv))
407 			return -EINVAL;
408 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
409 			return -EFAULT;
410 		tv->tv_sec = old_tv.tv_sec;
411 		tv->tv_usec = old_tv.tv_usec;
412 	} else {
413 		if (optlen < sizeof(*tv))
414 			return -EINVAL;
415 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
416 			return -EFAULT;
417 	}
418 
419 	return 0;
420 }
421 EXPORT_SYMBOL(sock_copy_user_timeval);
422 
423 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
424 			    bool old_timeval)
425 {
426 	struct __kernel_sock_timeval tv;
427 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
428 	long val;
429 
430 	if (err)
431 		return err;
432 
433 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
434 		return -EDOM;
435 
436 	if (tv.tv_sec < 0) {
437 		static int warned __read_mostly;
438 
439 		WRITE_ONCE(*timeo_p, 0);
440 		if (warned < 10 && net_ratelimit()) {
441 			warned++;
442 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
443 				__func__, current->comm, task_pid_nr(current));
444 		}
445 		return 0;
446 	}
447 	val = MAX_SCHEDULE_TIMEOUT;
448 	if ((tv.tv_sec || tv.tv_usec) &&
449 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
450 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
451 						    USEC_PER_SEC / HZ);
452 	WRITE_ONCE(*timeo_p, val);
453 	return 0;
454 }
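/* Worked example (editorial, assuming HZ == 1000): an SO_RCVTIMEO of
 * { .tv_sec = 2, .tv_usec = 500000 } becomes
 * 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ) = 2500 jiffies,
 * while an all-zero timeval maps to MAX_SCHEDULE_TIMEOUT, i.e. block
 * forever.
 */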
455 
456 static bool sock_needs_netstamp(const struct sock *sk)
457 {
458 	switch (sk->sk_family) {
459 	case AF_UNSPEC:
460 	case AF_UNIX:
461 		return false;
462 	default:
463 		return true;
464 	}
465 }
466 
467 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
468 {
469 	if (sk->sk_flags & flags) {
470 		sk->sk_flags &= ~flags;
471 		if (sock_needs_netstamp(sk) &&
472 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
473 			net_disable_timestamp();
474 	}
475 }
476 
477 
478 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
479 {
480 	unsigned long flags;
481 	struct sk_buff_head *list = &sk->sk_receive_queue;
482 
483 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
484 		atomic_inc(&sk->sk_drops);
485 		trace_sock_rcvqueue_full(sk, skb);
486 		return -ENOMEM;
487 	}
488 
489 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
490 		atomic_inc(&sk->sk_drops);
491 		return -ENOBUFS;
492 	}
493 
494 	skb->dev = NULL;
495 	skb_set_owner_r(skb, sk);
496 
497 	/* We escape from the RCU-protected region here, so make sure we don't
498 	 * leak a non-refcounted dst.
499 	 */
500 	skb_dst_force(skb);
501 
502 	spin_lock_irqsave(&list->lock, flags);
503 	sock_skb_set_dropcount(sk, skb);
504 	__skb_queue_tail(list, skb);
505 	spin_unlock_irqrestore(&list->lock, flags);
506 
507 	if (!sock_flag(sk, SOCK_DEAD))
508 		sk->sk_data_ready(sk);
509 	return 0;
510 }
511 EXPORT_SYMBOL(__sock_queue_rcv_skb);
512 
513 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
514 			      enum skb_drop_reason *reason)
515 {
516 	enum skb_drop_reason drop_reason;
517 	int err;
518 
519 	err = sk_filter(sk, skb);
520 	if (err) {
521 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
522 		goto out;
523 	}
524 	err = __sock_queue_rcv_skb(sk, skb);
525 	switch (err) {
526 	case -ENOMEM:
527 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
528 		break;
529 	case -ENOBUFS:
530 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
531 		break;
532 	default:
533 		drop_reason = SKB_NOT_DROPPED_YET;
534 		break;
535 	}
536 out:
537 	if (reason)
538 		*reason = drop_reason;
539 	return err;
540 }
541 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
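/* Illustrative sketch (assumption, not from this file): a protocol's
 * receive path can hand packets to the socket layer and report the
 * precise drop reason on failure:
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */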
542 
543 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
544 		     const int nested, unsigned int trim_cap, bool refcounted)
545 {
546 	int rc = NET_RX_SUCCESS;
547 
548 	if (sk_filter_trim_cap(sk, skb, trim_cap))
549 		goto discard_and_relse;
550 
551 	skb->dev = NULL;
552 
553 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
554 		atomic_inc(&sk->sk_drops);
555 		goto discard_and_relse;
556 	}
557 	if (nested)
558 		bh_lock_sock_nested(sk);
559 	else
560 		bh_lock_sock(sk);
561 	if (!sock_owned_by_user(sk)) {
562 		/*
563 		 * trylock + unlock semantics:
564 		 */
565 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
566 
567 		rc = sk_backlog_rcv(sk, skb);
568 
569 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
570 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
571 		bh_unlock_sock(sk);
572 		atomic_inc(&sk->sk_drops);
573 		goto discard_and_relse;
574 	}
575 
576 	bh_unlock_sock(sk);
577 out:
578 	if (refcounted)
579 		sock_put(sk);
580 	return rc;
581 discard_and_relse:
582 	kfree_skb(skb);
583 	goto out;
584 }
585 EXPORT_SYMBOL(__sk_receive_skb);
586 
587 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
588 							  u32));
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
590 							   u32));
591 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
592 {
593 	struct dst_entry *dst = __sk_dst_get(sk);
594 
595 	if (dst && dst->obsolete &&
596 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
597 			       dst, cookie) == NULL) {
598 		sk_tx_queue_clear(sk);
599 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
600 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
601 		dst_release(dst);
602 		return NULL;
603 	}
604 
605 	return dst;
606 }
607 EXPORT_SYMBOL(__sk_dst_check);
608 
609 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
610 {
611 	struct dst_entry *dst = sk_dst_get(sk);
612 
613 	if (dst && dst->obsolete &&
614 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
615 			       dst, cookie) == NULL) {
616 		sk_dst_reset(sk);
617 		dst_release(dst);
618 		return NULL;
619 	}
620 
621 	return dst;
622 }
623 EXPORT_SYMBOL(sk_dst_check);
624 
625 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
626 {
627 	int ret = -ENOPROTOOPT;
628 #ifdef CONFIG_NETDEVICES
629 	struct net *net = sock_net(sk);
630 
631 	/* Sorry... */
632 	ret = -EPERM;
633 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
634 		goto out;
635 
636 	ret = -EINVAL;
637 	if (ifindex < 0)
638 		goto out;
639 
640 	/* Paired with all READ_ONCE() done locklessly. */
641 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
642 
643 	if (sk->sk_prot->rehash)
644 		sk->sk_prot->rehash(sk);
645 	sk_dst_reset(sk);
646 
647 	ret = 0;
648 
649 out:
650 #endif
651 
652 	return ret;
653 }
654 
655 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
656 {
657 	int ret;
658 
659 	if (lock_sk)
660 		lock_sock(sk);
661 	ret = sock_bindtoindex_locked(sk, ifindex);
662 	if (lock_sk)
663 		release_sock(sk);
664 
665 	return ret;
666 }
667 EXPORT_SYMBOL(sock_bindtoindex);
668 
669 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
670 {
671 	int ret = -ENOPROTOOPT;
672 #ifdef CONFIG_NETDEVICES
673 	struct net *net = sock_net(sk);
674 	char devname[IFNAMSIZ];
675 	int index;
676 
677 	ret = -EINVAL;
678 	if (optlen < 0)
679 		goto out;
680 
681 	/* Bind this socket to a particular device like "eth0",
682 	 * as specified in the passed interface name. If the
683 	 * name is "" or the option length is zero the socket
684 	 * is not bound.
685 	 */
686 	if (optlen > IFNAMSIZ - 1)
687 		optlen = IFNAMSIZ - 1;
688 	memset(devname, 0, sizeof(devname));
689 
690 	ret = -EFAULT;
691 	if (copy_from_sockptr(devname, optval, optlen))
692 		goto out;
693 
694 	index = 0;
695 	if (devname[0] != '\0') {
696 		struct net_device *dev;
697 
698 		rcu_read_lock();
699 		dev = dev_get_by_name_rcu(net, devname);
700 		if (dev)
701 			index = dev->ifindex;
702 		rcu_read_unlock();
703 		ret = -ENODEV;
704 		if (!dev)
705 			goto out;
706 	}
707 
708 	sockopt_lock_sock(sk);
709 	ret = sock_bindtoindex_locked(sk, index);
710 	sockopt_release_sock(sk);
711 out:
712 #endif
713 
714 	return ret;
715 }
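/* Userspace view (editorial sketch, not part of this file): the function
 * above services a plain SO_BINDTODEVICE setsockopt, e.g.:
 *
 *	#include <sys/socket.h>
 *	#include <net/if.h>
 *	#include <string.h>
 *
 *	char ifname[IFNAMSIZ] = "eth0";		// hypothetical interface name
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		   ifname, strlen(ifname));
 *
 * Passing an empty name (or a zero option length) unbinds the socket.
 */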
716 
717 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
718 				sockptr_t optlen, int len)
719 {
720 	int ret = -ENOPROTOOPT;
721 #ifdef CONFIG_NETDEVICES
722 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
723 	struct net *net = sock_net(sk);
724 	char devname[IFNAMSIZ];
725 
726 	if (bound_dev_if == 0) {
727 		len = 0;
728 		goto zero;
729 	}
730 
731 	ret = -EINVAL;
732 	if (len < IFNAMSIZ)
733 		goto out;
734 
735 	ret = netdev_get_name(net, devname, bound_dev_if);
736 	if (ret)
737 		goto out;
738 
739 	len = strlen(devname) + 1;
740 
741 	ret = -EFAULT;
742 	if (copy_to_sockptr(optval, devname, len))
743 		goto out;
744 
745 zero:
746 	ret = -EFAULT;
747 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
748 		goto out;
749 
750 	ret = 0;
751 
752 out:
753 #endif
754 
755 	return ret;
756 }
757 
758 bool sk_mc_loop(const struct sock *sk)
759 {
760 	if (dev_recursion_level())
761 		return false;
762 	if (!sk)
763 		return true;
764 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
765 	switch (READ_ONCE(sk->sk_family)) {
766 	case AF_INET:
767 		return inet_test_bit(MC_LOOP, sk);
768 #if IS_ENABLED(CONFIG_IPV6)
769 	case AF_INET6:
770 		return inet6_test_bit(MC6_LOOP, sk);
771 #endif
772 	}
773 	WARN_ON_ONCE(1);
774 	return true;
775 }
776 EXPORT_SYMBOL(sk_mc_loop);
777 
778 void sock_set_reuseaddr(struct sock *sk)
779 {
780 	lock_sock(sk);
781 	sk->sk_reuse = SK_CAN_REUSE;
782 	release_sock(sk);
783 }
784 EXPORT_SYMBOL(sock_set_reuseaddr);
785 
786 void sock_set_reuseport(struct sock *sk)
787 {
788 	lock_sock(sk);
789 	sk->sk_reuseport = true;
790 	release_sock(sk);
791 }
792 EXPORT_SYMBOL(sock_set_reuseport);
793 
794 void sock_no_linger(struct sock *sk)
795 {
796 	lock_sock(sk);
797 	WRITE_ONCE(sk->sk_lingertime, 0);
798 	sock_set_flag(sk, SOCK_LINGER);
799 	release_sock(sk);
800 }
801 EXPORT_SYMBOL(sock_no_linger);
802 
803 void sock_set_priority(struct sock *sk, u32 priority)
804 {
805 	WRITE_ONCE(sk->sk_priority, priority);
806 }
807 EXPORT_SYMBOL(sock_set_priority);
808 
809 void sock_set_sndtimeo(struct sock *sk, s64 secs)
810 {
811 	lock_sock(sk);
812 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
813 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
814 	else
815 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
816 	release_sock(sk);
817 }
818 EXPORT_SYMBOL(sock_set_sndtimeo);
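/* Illustrative sketch (assumption, not from this file): in-kernel users
 * such as network filesystems configure their transport sockets with the
 * helpers above rather than via setsockopt(), e.g.:
 *
 *	sock_set_reuseaddr(sock->sk);
 *	sock_no_linger(sock->sk);
 *	sock_set_sndtimeo(sock->sk, 15);	// timeout in seconds
 */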
819 
820 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
821 {
822 	if (val)  {
823 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
824 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
825 		sock_set_flag(sk, SOCK_RCVTSTAMP);
826 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
827 	} else {
828 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
829 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
830 	}
831 }
832 
833 void sock_enable_timestamps(struct sock *sk)
834 {
835 	lock_sock(sk);
836 	__sock_set_timestamps(sk, true, false, true);
837 	release_sock(sk);
838 }
839 EXPORT_SYMBOL(sock_enable_timestamps);
840 
841 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
842 {
843 	switch (optname) {
844 	case SO_TIMESTAMP_OLD:
845 		__sock_set_timestamps(sk, valbool, false, false);
846 		break;
847 	case SO_TIMESTAMP_NEW:
848 		__sock_set_timestamps(sk, valbool, true, false);
849 		break;
850 	case SO_TIMESTAMPNS_OLD:
851 		__sock_set_timestamps(sk, valbool, false, true);
852 		break;
853 	case SO_TIMESTAMPNS_NEW:
854 		__sock_set_timestamps(sk, valbool, true, true);
855 		break;
856 	}
857 }
858 
859 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
860 {
861 	struct net *net = sock_net(sk);
862 	struct net_device *dev = NULL;
863 	bool match = false;
864 	int *vclock_index;
865 	int i, num;
866 
867 	if (sk->sk_bound_dev_if)
868 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
869 
870 	if (!dev) {
871 		pr_err("%s: sock not bound to a device\n", __func__);
872 		return -EOPNOTSUPP;
873 	}
874 
875 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
876 	dev_put(dev);
877 
878 	for (i = 0; i < num; i++) {
879 		if (*(vclock_index + i) == phc_index) {
880 			match = true;
881 			break;
882 		}
883 	}
884 
885 	if (num > 0)
886 		kfree(vclock_index);
887 
888 	if (!match)
889 		return -EINVAL;
890 
891 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
892 
893 	return 0;
894 }
895 
896 int sock_set_timestamping(struct sock *sk, int optname,
897 			  struct so_timestamping timestamping)
898 {
899 	int val = timestamping.flags;
900 	int ret;
901 
902 	if (val & ~SOF_TIMESTAMPING_MASK)
903 		return -EINVAL;
904 
905 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
906 	    !(val & SOF_TIMESTAMPING_OPT_ID))
907 		return -EINVAL;
908 
909 	if (val & SOF_TIMESTAMPING_OPT_ID &&
910 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
911 		if (sk_is_tcp(sk)) {
912 			if ((1 << sk->sk_state) &
913 			    (TCPF_CLOSE | TCPF_LISTEN))
914 				return -EINVAL;
915 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
916 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
917 			else
918 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
919 		} else {
920 			atomic_set(&sk->sk_tskey, 0);
921 		}
922 	}
923 
924 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
925 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
926 		return -EINVAL;
927 
928 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
929 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
930 		if (ret)
931 			return ret;
932 	}
933 
934 	WRITE_ONCE(sk->sk_tsflags, val);
935 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
936 
937 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
938 		sock_enable_timestamp(sk,
939 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
940 	else
941 		sock_disable_timestamp(sk,
942 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
943 	return 0;
944 }
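/* Userspace view (editorial sketch, not part of this file): the
 * SOF_TIMESTAMPING flags validated above are usually enabled with
 * something like:
 *
 *	#include <linux/net_tstamp.h>
 *
 *	int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *		    SOF_TIMESTAMPING_SOFTWARE;
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 *
 * With SOF_TIMESTAMPING_BIND_PHC, a struct so_timestamping carrying the
 * PHC index is passed instead; sock_timestamping_bind_phc() above checks
 * that index against the bound device's PHC vclocks.
 */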
945 
946 void sock_set_keepalive(struct sock *sk)
947 {
948 	lock_sock(sk);
949 	if (sk->sk_prot->keepalive)
950 		sk->sk_prot->keepalive(sk, true);
951 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
952 	release_sock(sk);
953 }
954 EXPORT_SYMBOL(sock_set_keepalive);
955 
956 static void __sock_set_rcvbuf(struct sock *sk, int val)
957 {
958 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
959 	 * as a negative value.
960 	 */
961 	val = min_t(int, val, INT_MAX / 2);
962 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
963 
964 	/* We double it on the way in to account for "struct sk_buff" etc.
965 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
966 	 * will allow that much actual data to be received on that socket.
967 	 *
968 	 * Applications are unaware that "struct sk_buff" and other overheads
969 	 * allocate from the receive buffer during socket buffer allocation.
970 	 *
971 	 * And after considering the possible alternatives, returning the value
972 	 * we actually used in getsockopt is the most desirable behavior.
973 	 */
974 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
975 }
976 
977 void sock_set_rcvbuf(struct sock *sk, int val)
978 {
979 	lock_sock(sk);
980 	__sock_set_rcvbuf(sk, val);
981 	release_sock(sk);
982 }
983 EXPORT_SYMBOL(sock_set_rcvbuf);
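/* Worked example (editorial, with default sysctl limits): because of the
 * doubling described above, setting SO_RCVBUF to 65536 makes a later
 * getsockopt(SO_RCVBUF) report 131072, the value actually used when
 * accounting skb truesize overhead.
 */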
984 
985 static void __sock_set_mark(struct sock *sk, u32 val)
986 {
987 	if (val != sk->sk_mark) {
988 		WRITE_ONCE(sk->sk_mark, val);
989 		sk_dst_reset(sk);
990 	}
991 }
992 
993 void sock_set_mark(struct sock *sk, u32 val)
994 {
995 	lock_sock(sk);
996 	__sock_set_mark(sk, val);
997 	release_sock(sk);
998 }
999 EXPORT_SYMBOL(sock_set_mark);
1000 
1001 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1002 {
1003 	/* Round down bytes to multiple of pages */
1004 	bytes = round_down(bytes, PAGE_SIZE);
1005 
1006 	WARN_ON(bytes > sk->sk_reserved_mem);
1007 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1008 	sk_mem_reclaim(sk);
1009 }
1010 
1011 static int sock_reserve_memory(struct sock *sk, int bytes)
1012 {
1013 	long allocated;
1014 	bool charged;
1015 	int pages;
1016 
1017 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1018 		return -EOPNOTSUPP;
1019 
1020 	if (!bytes)
1021 		return 0;
1022 
1023 	pages = sk_mem_pages(bytes);
1024 
1025 	/* pre-charge to memcg */
1026 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1027 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1028 	if (!charged)
1029 		return -ENOMEM;
1030 
1031 	/* pre-charge to forward_alloc */
1032 	sk_memory_allocated_add(sk, pages);
1033 	allocated = sk_memory_allocated(sk);
1034 	/* If the system goes into memory pressure with this
1035 	 * precharge, give up and return error.
1036 	 * precharge, give up and return an error.
1037 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1038 		sk_memory_allocated_sub(sk, pages);
1039 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1040 		return -ENOMEM;
1041 	}
1042 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1043 
1044 	WRITE_ONCE(sk->sk_reserved_mem,
1045 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1046 
1047 	return 0;
1048 }
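/* Worked example (editorial, assuming 4 KiB pages): an SO_RESERVE_MEM
 * request of 10000 bytes is charged as sk_mem_pages(10000) = 3 pages,
 * so sk_reserved_mem grows by 12288 bytes; shrinking the reservation is
 * rounded down to whole pages in sock_release_reserved_memory().
 */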
1049 
1050 void sockopt_lock_sock(struct sock *sk)
1051 {
1052 	/* When current->bpf_ctx is set, setsockopt() is being called from
1053 	 * a BPF program.  The BPF infrastructure has already ensured the
1054 	 * sk lock is held before calling setsockopt().
1055 	 */
1056 	if (has_current_bpf_ctx())
1057 		return;
1058 
1059 	lock_sock(sk);
1060 }
1061 EXPORT_SYMBOL(sockopt_lock_sock);
1062 
1063 void sockopt_release_sock(struct sock *sk)
1064 {
1065 	if (has_current_bpf_ctx())
1066 		return;
1067 
1068 	release_sock(sk);
1069 }
1070 EXPORT_SYMBOL(sockopt_release_sock);
1071 
1072 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1073 {
1074 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1075 }
1076 EXPORT_SYMBOL(sockopt_ns_capable);
1077 
1078 bool sockopt_capable(int cap)
1079 {
1080 	return has_current_bpf_ctx() || capable(cap);
1081 }
1082 EXPORT_SYMBOL(sockopt_capable);
1083 
1084 /*
1085  *	This is meant for all protocols to use and covers goings on
1086  *	at the socket level. Everything here is generic.
1087  */
1088 
1089 int sk_setsockopt(struct sock *sk, int level, int optname,
1090 		  sockptr_t optval, unsigned int optlen)
1091 {
1092 	struct so_timestamping timestamping;
1093 	struct socket *sock = sk->sk_socket;
1094 	struct sock_txtime sk_txtime;
1095 	int val;
1096 	int valbool;
1097 	struct linger ling;
1098 	int ret = 0;
1099 
1100 	/*
1101 	 *	Options without arguments
1102 	 */
1103 
1104 	if (optname == SO_BINDTODEVICE)
1105 		return sock_setbindtodevice(sk, optval, optlen);
1106 
1107 	if (optlen < sizeof(int))
1108 		return -EINVAL;
1109 
1110 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1111 		return -EFAULT;
1112 
1113 	valbool = val ? 1 : 0;
1114 
1115 	/* handle options which do not require locking the socket. */
1116 	switch (optname) {
1117 	case SO_PRIORITY:
1118 		if ((val >= 0 && val <= 6) ||
1119 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1120 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1121 			sock_set_priority(sk, val);
1122 			return 0;
1123 		}
1124 		return -EPERM;
1125 	case SO_PASSSEC:
1126 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1127 		return 0;
1128 	case SO_PASSCRED:
1129 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1130 		return 0;
1131 	case SO_PASSPIDFD:
1132 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1133 		return 0;
1134 	case SO_TYPE:
1135 	case SO_PROTOCOL:
1136 	case SO_DOMAIN:
1137 	case SO_ERROR:
1138 		return -ENOPROTOOPT;
1139 #ifdef CONFIG_NET_RX_BUSY_POLL
1140 	case SO_BUSY_POLL:
1141 		if (val < 0)
1142 			return -EINVAL;
1143 		WRITE_ONCE(sk->sk_ll_usec, val);
1144 		return 0;
1145 	case SO_PREFER_BUSY_POLL:
1146 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1147 			return -EPERM;
1148 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1149 		return 0;
1150 	case SO_BUSY_POLL_BUDGET:
1151 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1152 		    !sockopt_capable(CAP_NET_ADMIN))
1153 			return -EPERM;
1154 		if (val < 0 || val > U16_MAX)
1155 			return -EINVAL;
1156 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1157 		return 0;
1158 #endif
1159 	case SO_MAX_PACING_RATE:
1160 		{
1161 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1162 		unsigned long pacing_rate;
1163 
1164 		if (sizeof(ulval) != sizeof(val) &&
1165 		    optlen >= sizeof(ulval) &&
1166 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1167 			return -EFAULT;
1168 		}
1169 		if (ulval != ~0UL)
1170 			cmpxchg(&sk->sk_pacing_status,
1171 				SK_PACING_NONE,
1172 				SK_PACING_NEEDED);
1173 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1174 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1175 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1176 		if (ulval < pacing_rate)
1177 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1178 		return 0;
1179 		}
1180 	case SO_TXREHASH:
1181 		if (val < -1 || val > 1)
1182 			return -EINVAL;
1183 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1184 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1185 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1186 		 * and sk_getsockopt().
1187 		 */
1188 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1189 		return 0;
1190 	}
1191 
1192 	sockopt_lock_sock(sk);
1193 
1194 	switch (optname) {
1195 	case SO_DEBUG:
1196 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1197 			ret = -EACCES;
1198 		else
1199 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1200 		break;
1201 	case SO_REUSEADDR:
1202 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1203 		break;
1204 	case SO_REUSEPORT:
1205 		sk->sk_reuseport = valbool;
1206 		break;
1207 	case SO_DONTROUTE:
1208 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1209 		sk_dst_reset(sk);
1210 		break;
1211 	case SO_BROADCAST:
1212 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1213 		break;
1214 	case SO_SNDBUF:
1215 		/* Don't error on this; BSD doesn't, and if you think
1216 		 * about it, this is right. Otherwise apps have to
1217 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1218 		 * are treated in BSD as hints.
1219 		 */
1220 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1221 set_sndbuf:
1222 		/* Ensure val * 2 fits into an int, to prevent max_t()
1223 		 * from treating it as a negative value.
1224 		 */
1225 		val = min_t(int, val, INT_MAX / 2);
1226 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1227 		WRITE_ONCE(sk->sk_sndbuf,
1228 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1229 		/* Wake up sending tasks if we upped the value. */
1230 		sk->sk_write_space(sk);
1231 		break;
1232 
1233 	case SO_SNDBUFFORCE:
1234 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1235 			ret = -EPERM;
1236 			break;
1237 		}
1238 
1239 		/* No negative values (to prevent underflow, as val will be
1240 		 * multiplied by 2).
1241 		 */
1242 		if (val < 0)
1243 			val = 0;
1244 		goto set_sndbuf;
1245 
1246 	case SO_RCVBUF:
1247 		/* Don't error on this; BSD doesn't, and if you think
1248 		 * about it, this is right. Otherwise apps have to
1249 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1250 		 * are treated in BSD as hints.
1251 		 */
1252 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1253 		break;
1254 
1255 	case SO_RCVBUFFORCE:
1256 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1257 			ret = -EPERM;
1258 			break;
1259 		}
1260 
1261 		/* No negative values (to prevent underflow, as val will be
1262 		 * multiplied by 2).
1263 		 */
1264 		__sock_set_rcvbuf(sk, max(val, 0));
1265 		break;
1266 
1267 	case SO_KEEPALIVE:
1268 		if (sk->sk_prot->keepalive)
1269 			sk->sk_prot->keepalive(sk, valbool);
1270 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1271 		break;
1272 
1273 	case SO_OOBINLINE:
1274 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1275 		break;
1276 
1277 	case SO_NO_CHECK:
1278 		sk->sk_no_check_tx = valbool;
1279 		break;
1280 
1281 	case SO_LINGER:
1282 		if (optlen < sizeof(ling)) {
1283 			ret = -EINVAL;	/* 1003.1g */
1284 			break;
1285 		}
1286 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1287 			ret = -EFAULT;
1288 			break;
1289 		}
1290 		if (!ling.l_onoff) {
1291 			sock_reset_flag(sk, SOCK_LINGER);
1292 		} else {
1293 			unsigned long t_sec = ling.l_linger;
1294 
1295 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1296 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1297 			else
1298 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1299 			sock_set_flag(sk, SOCK_LINGER);
1300 		}
1301 		break;
1302 
1303 	case SO_BSDCOMPAT:
1304 		break;
1305 
1306 	case SO_TIMESTAMP_OLD:
1307 	case SO_TIMESTAMP_NEW:
1308 	case SO_TIMESTAMPNS_OLD:
1309 	case SO_TIMESTAMPNS_NEW:
1310 		sock_set_timestamp(sk, optname, valbool);
1311 		break;
1312 
1313 	case SO_TIMESTAMPING_NEW:
1314 	case SO_TIMESTAMPING_OLD:
1315 		if (optlen == sizeof(timestamping)) {
1316 			if (copy_from_sockptr(&timestamping, optval,
1317 					      sizeof(timestamping))) {
1318 				ret = -EFAULT;
1319 				break;
1320 			}
1321 		} else {
1322 			memset(&timestamping, 0, sizeof(timestamping));
1323 			timestamping.flags = val;
1324 		}
1325 		ret = sock_set_timestamping(sk, optname, timestamping);
1326 		break;
1327 
1328 	case SO_RCVLOWAT:
1329 		{
1330 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1331 
1332 		if (val < 0)
1333 			val = INT_MAX;
1334 		if (sock)
1335 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1336 		if (set_rcvlowat)
1337 			ret = set_rcvlowat(sk, val);
1338 		else
1339 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1340 		break;
1341 		}
1342 	case SO_RCVTIMEO_OLD:
1343 	case SO_RCVTIMEO_NEW:
1344 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1345 				       optlen, optname == SO_RCVTIMEO_OLD);
1346 		break;
1347 
1348 	case SO_SNDTIMEO_OLD:
1349 	case SO_SNDTIMEO_NEW:
1350 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1351 				       optlen, optname == SO_SNDTIMEO_OLD);
1352 		break;
1353 
1354 	case SO_ATTACH_FILTER: {
1355 		struct sock_fprog fprog;
1356 
1357 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1358 		if (!ret)
1359 			ret = sk_attach_filter(&fprog, sk);
1360 		break;
1361 	}
1362 	case SO_ATTACH_BPF:
1363 		ret = -EINVAL;
1364 		if (optlen == sizeof(u32)) {
1365 			u32 ufd;
1366 
1367 			ret = -EFAULT;
1368 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1369 				break;
1370 
1371 			ret = sk_attach_bpf(ufd, sk);
1372 		}
1373 		break;
1374 
1375 	case SO_ATTACH_REUSEPORT_CBPF: {
1376 		struct sock_fprog fprog;
1377 
1378 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1379 		if (!ret)
1380 			ret = sk_reuseport_attach_filter(&fprog, sk);
1381 		break;
1382 	}
1383 	case SO_ATTACH_REUSEPORT_EBPF:
1384 		ret = -EINVAL;
1385 		if (optlen == sizeof(u32)) {
1386 			u32 ufd;
1387 
1388 			ret = -EFAULT;
1389 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1390 				break;
1391 
1392 			ret = sk_reuseport_attach_bpf(ufd, sk);
1393 		}
1394 		break;
1395 
1396 	case SO_DETACH_REUSEPORT_BPF:
1397 		ret = reuseport_detach_prog(sk);
1398 		break;
1399 
1400 	case SO_DETACH_FILTER:
1401 		ret = sk_detach_filter(sk);
1402 		break;
1403 
1404 	case SO_LOCK_FILTER:
1405 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1406 			ret = -EPERM;
1407 		else
1408 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1409 		break;
1410 
1411 	case SO_MARK:
1412 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1413 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1414 			ret = -EPERM;
1415 			break;
1416 		}
1417 
1418 		__sock_set_mark(sk, val);
1419 		break;
1420 	case SO_RCVMARK:
1421 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1422 		break;
1423 
1424 	case SO_RXQ_OVFL:
1425 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1426 		break;
1427 
1428 	case SO_WIFI_STATUS:
1429 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1430 		break;
1431 
1432 	case SO_PEEK_OFF:
1433 		{
1434 		int (*set_peek_off)(struct sock *sk, int val);
1435 
1436 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1437 		if (set_peek_off)
1438 			ret = set_peek_off(sk, val);
1439 		else
1440 			ret = -EOPNOTSUPP;
1441 		break;
1442 		}
1443 
1444 	case SO_NOFCS:
1445 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1446 		break;
1447 
1448 	case SO_SELECT_ERR_QUEUE:
1449 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1450 		break;
1451 
1452 
1453 	case SO_INCOMING_CPU:
1454 		reuseport_update_incoming_cpu(sk, val);
1455 		break;
1456 
1457 	case SO_CNX_ADVICE:
1458 		if (val == 1)
1459 			dst_negative_advice(sk);
1460 		break;
1461 
1462 	case SO_ZEROCOPY:
1463 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1464 			if (!(sk_is_tcp(sk) ||
1465 			      (sk->sk_type == SOCK_DGRAM &&
1466 			       sk->sk_protocol == IPPROTO_UDP)))
1467 				ret = -EOPNOTSUPP;
1468 		} else if (sk->sk_family != PF_RDS) {
1469 			ret = -EOPNOTSUPP;
1470 		}
1471 		if (!ret) {
1472 			if (val < 0 || val > 1)
1473 				ret = -EINVAL;
1474 			else
1475 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1476 		}
1477 		break;
1478 
1479 	case SO_TXTIME:
1480 		if (optlen != sizeof(struct sock_txtime)) {
1481 			ret = -EINVAL;
1482 			break;
1483 		} else if (copy_from_sockptr(&sk_txtime, optval,
1484 			   sizeof(struct sock_txtime))) {
1485 			ret = -EFAULT;
1486 			break;
1487 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1488 			ret = -EINVAL;
1489 			break;
1490 		}
1491 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1492 		 * scheduler has enough safeguards.
1493 		 */
1494 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1495 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1496 			ret = -EPERM;
1497 			break;
1498 		}
1499 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1500 		sk->sk_clockid = sk_txtime.clockid;
1501 		sk->sk_txtime_deadline_mode =
1502 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1503 		sk->sk_txtime_report_errors =
1504 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1505 		break;
1506 
1507 	case SO_BINDTOIFINDEX:
1508 		ret = sock_bindtoindex_locked(sk, val);
1509 		break;
1510 
1511 	case SO_BUF_LOCK:
1512 		if (val & ~SOCK_BUF_LOCK_MASK) {
1513 			ret = -EINVAL;
1514 			break;
1515 		}
1516 		sk->sk_userlocks = val | (sk->sk_userlocks &
1517 					  ~SOCK_BUF_LOCK_MASK);
1518 		break;
1519 
1520 	case SO_RESERVE_MEM:
1521 	{
1522 		int delta;
1523 
1524 		if (val < 0) {
1525 			ret = -EINVAL;
1526 			break;
1527 		}
1528 
1529 		delta = val - sk->sk_reserved_mem;
1530 		if (delta < 0)
1531 			sock_release_reserved_memory(sk, -delta);
1532 		else
1533 			ret = sock_reserve_memory(sk, delta);
1534 		break;
1535 	}
1536 
1537 	default:
1538 		ret = -ENOPROTOOPT;
1539 		break;
1540 	}
1541 	sockopt_release_sock(sk);
1542 	return ret;
1543 }
1544 
1545 int sock_setsockopt(struct socket *sock, int level, int optname,
1546 		    sockptr_t optval, unsigned int optlen)
1547 {
1548 	return sk_setsockopt(sock->sk, level, optname,
1549 			     optval, optlen);
1550 }
1551 EXPORT_SYMBOL(sock_setsockopt);
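/* Illustrative sketch (assumption, not from this file): in-kernel callers
 * pass kernel memory through the sockptr_t abstraction, e.g.:
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 */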
1552 
1553 static const struct cred *sk_get_peer_cred(struct sock *sk)
1554 {
1555 	const struct cred *cred;
1556 
1557 	spin_lock(&sk->sk_peer_lock);
1558 	cred = get_cred(sk->sk_peer_cred);
1559 	spin_unlock(&sk->sk_peer_lock);
1560 
1561 	return cred;
1562 }
1563 
1564 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1565 			  struct ucred *ucred)
1566 {
1567 	ucred->pid = pid_vnr(pid);
1568 	ucred->uid = ucred->gid = -1;
1569 	if (cred) {
1570 		struct user_namespace *current_ns = current_user_ns();
1571 
1572 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1573 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1574 	}
1575 }
1576 
1577 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1578 {
1579 	struct user_namespace *user_ns = current_user_ns();
1580 	int i;
1581 
1582 	for (i = 0; i < src->ngroups; i++) {
1583 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1584 
1585 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1586 			return -EFAULT;
1587 	}
1588 
1589 	return 0;
1590 }
1591 
1592 int sk_getsockopt(struct sock *sk, int level, int optname,
1593 		  sockptr_t optval, sockptr_t optlen)
1594 {
1595 	struct socket *sock = sk->sk_socket;
1596 
1597 	union {
1598 		int val;
1599 		u64 val64;
1600 		unsigned long ulval;
1601 		struct linger ling;
1602 		struct old_timeval32 tm32;
1603 		struct __kernel_old_timeval tm;
1604 		struct  __kernel_sock_timeval stm;
1605 		struct sock_txtime txtime;
1606 		struct so_timestamping timestamping;
1607 	} v;
1608 
1609 	int lv = sizeof(int);
1610 	int len;
1611 
1612 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1613 		return -EFAULT;
1614 	if (len < 0)
1615 		return -EINVAL;
1616 
1617 	memset(&v, 0, sizeof(v));
1618 
1619 	switch (optname) {
1620 	case SO_DEBUG:
1621 		v.val = sock_flag(sk, SOCK_DBG);
1622 		break;
1623 
1624 	case SO_DONTROUTE:
1625 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1626 		break;
1627 
1628 	case SO_BROADCAST:
1629 		v.val = sock_flag(sk, SOCK_BROADCAST);
1630 		break;
1631 
1632 	case SO_SNDBUF:
1633 		v.val = READ_ONCE(sk->sk_sndbuf);
1634 		break;
1635 
1636 	case SO_RCVBUF:
1637 		v.val = READ_ONCE(sk->sk_rcvbuf);
1638 		break;
1639 
1640 	case SO_REUSEADDR:
1641 		v.val = sk->sk_reuse;
1642 		break;
1643 
1644 	case SO_REUSEPORT:
1645 		v.val = sk->sk_reuseport;
1646 		break;
1647 
1648 	case SO_KEEPALIVE:
1649 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1650 		break;
1651 
1652 	case SO_TYPE:
1653 		v.val = sk->sk_type;
1654 		break;
1655 
1656 	case SO_PROTOCOL:
1657 		v.val = sk->sk_protocol;
1658 		break;
1659 
1660 	case SO_DOMAIN:
1661 		v.val = sk->sk_family;
1662 		break;
1663 
1664 	case SO_ERROR:
1665 		v.val = -sock_error(sk);
1666 		if (v.val == 0)
1667 			v.val = xchg(&sk->sk_err_soft, 0);
1668 		break;
1669 
1670 	case SO_OOBINLINE:
1671 		v.val = sock_flag(sk, SOCK_URGINLINE);
1672 		break;
1673 
1674 	case SO_NO_CHECK:
1675 		v.val = sk->sk_no_check_tx;
1676 		break;
1677 
1678 	case SO_PRIORITY:
1679 		v.val = READ_ONCE(sk->sk_priority);
1680 		break;
1681 
1682 	case SO_LINGER:
1683 		lv		= sizeof(v.ling);
1684 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1685 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1686 		break;
1687 
1688 	case SO_BSDCOMPAT:
1689 		break;
1690 
1691 	case SO_TIMESTAMP_OLD:
1692 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1693 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1694 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1695 		break;
1696 
1697 	case SO_TIMESTAMPNS_OLD:
1698 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1699 		break;
1700 
1701 	case SO_TIMESTAMP_NEW:
1702 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1703 		break;
1704 
1705 	case SO_TIMESTAMPNS_NEW:
1706 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1707 		break;
1708 
1709 	case SO_TIMESTAMPING_OLD:
1710 		lv = sizeof(v.timestamping);
1711 		v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1712 		v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1713 		break;
1714 
1715 	case SO_RCVTIMEO_OLD:
1716 	case SO_RCVTIMEO_NEW:
1717 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1718 				      SO_RCVTIMEO_OLD == optname);
1719 		break;
1720 
1721 	case SO_SNDTIMEO_OLD:
1722 	case SO_SNDTIMEO_NEW:
1723 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1724 				      SO_SNDTIMEO_OLD == optname);
1725 		break;
1726 
1727 	case SO_RCVLOWAT:
1728 		v.val = READ_ONCE(sk->sk_rcvlowat);
1729 		break;
1730 
1731 	case SO_SNDLOWAT:
1732 		v.val = 1;
1733 		break;
1734 
1735 	case SO_PASSCRED:
1736 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1737 		break;
1738 
1739 	case SO_PASSPIDFD:
1740 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1741 		break;
1742 
1743 	case SO_PEERCRED:
1744 	{
1745 		struct ucred peercred;
1746 		if (len > sizeof(peercred))
1747 			len = sizeof(peercred);
1748 
1749 		spin_lock(&sk->sk_peer_lock);
1750 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1751 		spin_unlock(&sk->sk_peer_lock);
1752 
1753 		if (copy_to_sockptr(optval, &peercred, len))
1754 			return -EFAULT;
1755 		goto lenout;
1756 	}
1757 
1758 	case SO_PEERPIDFD:
1759 	{
1760 		struct pid *peer_pid;
1761 		struct file *pidfd_file = NULL;
1762 		int pidfd;
1763 
1764 		if (len > sizeof(pidfd))
1765 			len = sizeof(pidfd);
1766 
1767 		spin_lock(&sk->sk_peer_lock);
1768 		peer_pid = get_pid(sk->sk_peer_pid);
1769 		spin_unlock(&sk->sk_peer_lock);
1770 
1771 		if (!peer_pid)
1772 			return -ENODATA;
1773 
1774 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1775 		put_pid(peer_pid);
1776 		if (pidfd < 0)
1777 			return pidfd;
1778 
1779 		if (copy_to_sockptr(optval, &pidfd, len) ||
1780 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1781 			put_unused_fd(pidfd);
1782 			fput(pidfd_file);
1783 
1784 			return -EFAULT;
1785 		}
1786 
1787 		fd_install(pidfd, pidfd_file);
1788 		return 0;
1789 	}
1790 
1791 	case SO_PEERGROUPS:
1792 	{
1793 		const struct cred *cred;
1794 		int ret, n;
1795 
1796 		cred = sk_get_peer_cred(sk);
1797 		if (!cred)
1798 			return -ENODATA;
1799 
1800 		n = cred->group_info->ngroups;
1801 		if (len < n * sizeof(gid_t)) {
1802 			len = n * sizeof(gid_t);
1803 			put_cred(cred);
1804 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1805 		}
1806 		len = n * sizeof(gid_t);
1807 
1808 		ret = groups_to_user(optval, cred->group_info);
1809 		put_cred(cred);
1810 		if (ret)
1811 			return ret;
1812 		goto lenout;
1813 	}
1814 
1815 	case SO_PEERNAME:
1816 	{
1817 		struct sockaddr_storage address;
1818 
1819 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1820 		if (lv < 0)
1821 			return -ENOTCONN;
1822 		if (lv < len)
1823 			return -EINVAL;
1824 		if (copy_to_sockptr(optval, &address, len))
1825 			return -EFAULT;
1826 		goto lenout;
1827 	}
1828 
1829 	/* Dubious BSD thing... Probably nobody even uses it, but
1830 	 * the UNIX standard wants it for whatever reason... -DaveM
1831 	 */
1832 	case SO_ACCEPTCONN:
1833 		v.val = sk->sk_state == TCP_LISTEN;
1834 		break;
1835 
1836 	case SO_PASSSEC:
1837 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1838 		break;
1839 
1840 	case SO_PEERSEC:
1841 		return security_socket_getpeersec_stream(sock,
1842 							 optval, optlen, len);
1843 
1844 	case SO_MARK:
1845 		v.val = READ_ONCE(sk->sk_mark);
1846 		break;
1847 
1848 	case SO_RCVMARK:
1849 		v.val = sock_flag(sk, SOCK_RCVMARK);
1850 		break;
1851 
1852 	case SO_RXQ_OVFL:
1853 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1854 		break;
1855 
1856 	case SO_WIFI_STATUS:
1857 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1858 		break;
1859 
1860 	case SO_PEEK_OFF:
1861 		if (!READ_ONCE(sock->ops)->set_peek_off)
1862 			return -EOPNOTSUPP;
1863 
1864 		v.val = READ_ONCE(sk->sk_peek_off);
1865 		break;
1866 	case SO_NOFCS:
1867 		v.val = sock_flag(sk, SOCK_NOFCS);
1868 		break;
1869 
1870 	case SO_BINDTODEVICE:
1871 		return sock_getbindtodevice(sk, optval, optlen, len);
1872 
1873 	case SO_GET_FILTER:
1874 		len = sk_get_filter(sk, optval, len);
1875 		if (len < 0)
1876 			return len;
1877 
1878 		goto lenout;
1879 
1880 	case SO_LOCK_FILTER:
1881 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1882 		break;
1883 
1884 	case SO_BPF_EXTENSIONS:
1885 		v.val = bpf_tell_extensions();
1886 		break;
1887 
1888 	case SO_SELECT_ERR_QUEUE:
1889 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1890 		break;
1891 
1892 #ifdef CONFIG_NET_RX_BUSY_POLL
1893 	case SO_BUSY_POLL:
1894 		v.val = READ_ONCE(sk->sk_ll_usec);
1895 		break;
1896 	case SO_PREFER_BUSY_POLL:
1897 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1898 		break;
1899 #endif
1900 
1901 	case SO_MAX_PACING_RATE:
1902 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1903 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1904 			lv = sizeof(v.ulval);
1905 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1906 		} else {
1907 			/* 32bit version */
1908 			v.val = min_t(unsigned long, ~0U,
1909 				      READ_ONCE(sk->sk_max_pacing_rate));
1910 		}
1911 		break;
1912 
1913 	case SO_INCOMING_CPU:
1914 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1915 		break;
1916 
1917 	case SO_MEMINFO:
1918 	{
1919 		u32 meminfo[SK_MEMINFO_VARS];
1920 
1921 		sk_get_meminfo(sk, meminfo);
1922 
1923 		len = min_t(unsigned int, len, sizeof(meminfo));
1924 		if (copy_to_sockptr(optval, &meminfo, len))
1925 			return -EFAULT;
1926 
1927 		goto lenout;
1928 	}
1929 
1930 #ifdef CONFIG_NET_RX_BUSY_POLL
1931 	case SO_INCOMING_NAPI_ID:
1932 		v.val = READ_ONCE(sk->sk_napi_id);
1933 
1934 		/* aggregate non-NAPI IDs down to 0 */
1935 		if (v.val < MIN_NAPI_ID)
1936 			v.val = 0;
1937 
1938 		break;
1939 #endif
1940 
1941 	case SO_COOKIE:
1942 		lv = sizeof(u64);
1943 		if (len < lv)
1944 			return -EINVAL;
1945 		v.val64 = sock_gen_cookie(sk);
1946 		break;
1947 
1948 	case SO_ZEROCOPY:
1949 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1950 		break;
1951 
1952 	case SO_TXTIME:
1953 		lv = sizeof(v.txtime);
1954 		v.txtime.clockid = sk->sk_clockid;
1955 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1956 				  SOF_TXTIME_DEADLINE_MODE : 0;
1957 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1958 				  SOF_TXTIME_REPORT_ERRORS : 0;
1959 		break;
1960 
1961 	case SO_BINDTOIFINDEX:
1962 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1963 		break;
1964 
1965 	case SO_NETNS_COOKIE:
1966 		lv = sizeof(u64);
1967 		if (len != lv)
1968 			return -EINVAL;
1969 		v.val64 = sock_net(sk)->net_cookie;
1970 		break;
1971 
1972 	case SO_BUF_LOCK:
1973 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1974 		break;
1975 
1976 	case SO_RESERVE_MEM:
1977 		v.val = READ_ONCE(sk->sk_reserved_mem);
1978 		break;
1979 
1980 	case SO_TXREHASH:
1981 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
1982 		v.val = READ_ONCE(sk->sk_txrehash);
1983 		break;
1984 
1985 	default:
1986 		/* We implement SO_SNDLOWAT etc. to not be settable
1987 		 * (1003.1g 7).
1988 		 */
1989 		return -ENOPROTOOPT;
1990 	}
1991 
1992 	if (len > lv)
1993 		len = lv;
1994 	if (copy_to_sockptr(optval, &v, len))
1995 		return -EFAULT;
1996 lenout:
1997 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
1998 		return -EFAULT;
1999 	return 0;
2000 }
2001 
2002 /*
2003  * Initialize an sk_lock.
2004  *
2005  * (We also register the sk_lock with the lock validator.)
2006  */
2007 static inline void sock_lock_init(struct sock *sk)
2008 {
2009 	if (sk->sk_kern_sock)
2010 		sock_lock_init_class_and_name(
2011 			sk,
2012 			af_family_kern_slock_key_strings[sk->sk_family],
2013 			af_family_kern_slock_keys + sk->sk_family,
2014 			af_family_kern_key_strings[sk->sk_family],
2015 			af_family_kern_keys + sk->sk_family);
2016 	else
2017 		sock_lock_init_class_and_name(
2018 			sk,
2019 			af_family_slock_key_strings[sk->sk_family],
2020 			af_family_slock_keys + sk->sk_family,
2021 			af_family_key_strings[sk->sk_family],
2022 			af_family_keys + sk->sk_family);
2023 }
2024 
2025 /*
2026  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2027  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2028  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2029  */
2030 static void sock_copy(struct sock *nsk, const struct sock *osk)
2031 {
2032 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2033 #ifdef CONFIG_SECURITY_NETWORK
2034 	void *sptr = nsk->sk_security;
2035 #endif
2036 
2037 	/* If we move sk_tx_queue_mapping out of the private section,
2038 	 * we must check if sk_tx_queue_clear() is called after
2039 	 * sock_copy() in sk_clone_lock().
2040 	 */
2041 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2042 		     offsetof(struct sock, sk_dontcopy_begin) ||
2043 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2044 		     offsetof(struct sock, sk_dontcopy_end));
2045 
2046 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2047 
2048 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2049 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2050 
2051 #ifdef CONFIG_SECURITY_NETWORK
2052 	nsk->sk_security = sptr;
2053 	security_sk_clone(osk, nsk);
2054 #endif
2055 }
2056 
2057 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2058 		int family)
2059 {
2060 	struct sock *sk;
2061 	struct kmem_cache *slab;
2062 
2063 	slab = prot->slab;
2064 	if (slab != NULL) {
2065 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2066 		if (!sk)
2067 			return sk;
2068 		if (want_init_on_alloc(priority))
2069 			sk_prot_clear_nulls(sk, prot->obj_size);
2070 	} else
2071 		sk = kmalloc(prot->obj_size, priority);
2072 
2073 	if (sk != NULL) {
2074 		if (security_sk_alloc(sk, family, priority))
2075 			goto out_free;
2076 
2077 		if (!try_module_get(prot->owner))
2078 			goto out_free_sec;
2079 	}
2080 
2081 	return sk;
2082 
2083 out_free_sec:
2084 	security_sk_free(sk);
2085 out_free:
2086 	if (slab != NULL)
2087 		kmem_cache_free(slab, sk);
2088 	else
2089 		kfree(sk);
2090 	return NULL;
2091 }
2092 
2093 static void sk_prot_free(struct proto *prot, struct sock *sk)
2094 {
2095 	struct kmem_cache *slab;
2096 	struct module *owner;
2097 
2098 	owner = prot->owner;
2099 	slab = prot->slab;
2100 
2101 	cgroup_sk_free(&sk->sk_cgrp_data);
2102 	mem_cgroup_sk_free(sk);
2103 	security_sk_free(sk);
2104 	if (slab != NULL)
2105 		kmem_cache_free(slab, sk);
2106 	else
2107 		kfree(sk);
2108 	module_put(owner);
2109 }
2110 
2111 /**
2112  *	sk_alloc - All socket objects are allocated here
2113  *	@net: the applicable net namespace
2114  *	@family: protocol family
2115  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2116  *	@prot: struct proto associated with this new sock instance
2117  *	@kern: is this to be a kernel socket?
2118  */
2119 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2120 		      struct proto *prot, int kern)
2121 {
2122 	struct sock *sk;
2123 
2124 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2125 	if (sk) {
2126 		sk->sk_family = family;
2127 		/*
2128 		 * See comment in struct sock definition to understand
2129 		 * why we need sk_prot_creator -acme
2130 		 */
2131 		sk->sk_prot = sk->sk_prot_creator = prot;
2132 		sk->sk_kern_sock = kern;
2133 		sock_lock_init(sk);
2134 		sk->sk_net_refcnt = kern ? 0 : 1;
2135 		if (likely(sk->sk_net_refcnt)) {
2136 			get_net_track(net, &sk->ns_tracker, priority);
2137 			sock_inuse_add(net, 1);
2138 		} else {
2139 			__netns_tracker_alloc(net, &sk->ns_tracker,
2140 					      false, priority);
2141 		}
2142 
2143 		sock_net_set(sk, net);
2144 		refcount_set(&sk->sk_wmem_alloc, 1);
2145 
2146 		mem_cgroup_sk_alloc(sk);
2147 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2148 		sock_update_classid(&sk->sk_cgrp_data);
2149 		sock_update_netprioidx(&sk->sk_cgrp_data);
2150 		sk_tx_queue_clear(sk);
2151 	}
2152 
2153 	return sk;
2154 }
2155 EXPORT_SYMBOL(sk_alloc);
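
/* Hedged usage sketch (not taken from this file): an address family's
 * ->create() hook typically allocates and initialises its sock like
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where PF_FOO and foo_proto stand for a hypothetical protocol.
 */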
2156 
2157 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2158  * grace period. This is the case for UDP sockets and TCP listeners.
2159  */
2160 static void __sk_destruct(struct rcu_head *head)
2161 {
2162 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2163 	struct sk_filter *filter;
2164 
2165 	if (sk->sk_destruct)
2166 		sk->sk_destruct(sk);
2167 
2168 	filter = rcu_dereference_check(sk->sk_filter,
2169 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2170 	if (filter) {
2171 		sk_filter_uncharge(sk, filter);
2172 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2173 	}
2174 
2175 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2176 
2177 #ifdef CONFIG_BPF_SYSCALL
2178 	bpf_sk_storage_free(sk);
2179 #endif
2180 
2181 	if (atomic_read(&sk->sk_omem_alloc))
2182 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2183 			 __func__, atomic_read(&sk->sk_omem_alloc));
2184 
2185 	if (sk->sk_frag.page) {
2186 		put_page(sk->sk_frag.page);
2187 		sk->sk_frag.page = NULL;
2188 	}
2189 
2190 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2191 	put_cred(sk->sk_peer_cred);
2192 	put_pid(sk->sk_peer_pid);
2193 
2194 	if (likely(sk->sk_net_refcnt))
2195 		put_net_track(sock_net(sk), &sk->ns_tracker);
2196 	else
2197 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2198 
2199 	sk_prot_free(sk->sk_prot_creator, sk);
2200 }
2201 
2202 void sk_destruct(struct sock *sk)
2203 {
2204 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2205 
2206 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2207 		reuseport_detach_sock(sk);
2208 		use_call_rcu = true;
2209 	}
2210 
2211 	if (use_call_rcu)
2212 		call_rcu(&sk->sk_rcu, __sk_destruct);
2213 	else
2214 		__sk_destruct(&sk->sk_rcu);
2215 }
2216 
2217 static void __sk_free(struct sock *sk)
2218 {
2219 	if (likely(sk->sk_net_refcnt))
2220 		sock_inuse_add(sock_net(sk), -1);
2221 
2222 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2223 		sock_diag_broadcast_destroy(sk);
2224 	else
2225 		sk_destruct(sk);
2226 }
2227 
2228 void sk_free(struct sock *sk)
2229 {
2230 	/*
2231 	 * We subtract one from sk_wmem_alloc and can know if
2232 	 * some packets are still in some tx queue.
2233 	 * If not null, sock_wfree() will call __sk_free(sk) later
2234 	 */
2235 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2236 		__sk_free(sk);
2237 }
2238 EXPORT_SYMBOL(sk_free);
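
/* Illustrative lifetime sketch: the initial sk_wmem_alloc reference taken in
 * sk_alloc() means sk_free() only reaches __sk_free() once the last in-flight
 * skb has been freed by sock_wfree():
 *
 *	refcount_set(&sk->sk_wmem_alloc, 1);	(at allocation time)
 *	skb_set_owner_w(skb, sk);		(adds skb->truesize)
 *	sk_free(sk);				(drops the initial unit)
 *	kfree_skb(skb);				(sock_wfree() drops the rest
 *						 and then calls __sk_free())
 */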
2239 
2240 static void sk_init_common(struct sock *sk)
2241 {
2242 	skb_queue_head_init(&sk->sk_receive_queue);
2243 	skb_queue_head_init(&sk->sk_write_queue);
2244 	skb_queue_head_init(&sk->sk_error_queue);
2245 
2246 	rwlock_init(&sk->sk_callback_lock);
2247 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2248 			af_rlock_keys + sk->sk_family,
2249 			af_family_rlock_key_strings[sk->sk_family]);
2250 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2251 			af_wlock_keys + sk->sk_family,
2252 			af_family_wlock_key_strings[sk->sk_family]);
2253 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2254 			af_elock_keys + sk->sk_family,
2255 			af_family_elock_key_strings[sk->sk_family]);
2256 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2257 			af_callback_keys + sk->sk_family,
2258 			af_family_clock_key_strings[sk->sk_family]);
2259 }
2260 
2261 /**
2262  *	sk_clone_lock - clone a socket, and lock its clone
2263  *	@sk: the socket to clone
2264  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2265  *
2266  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2267  */
2268 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2269 {
2270 	struct proto *prot = READ_ONCE(sk->sk_prot);
2271 	struct sk_filter *filter;
2272 	bool is_charged = true;
2273 	struct sock *newsk;
2274 
2275 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2276 	if (!newsk)
2277 		goto out;
2278 
2279 	sock_copy(newsk, sk);
2280 
2281 	newsk->sk_prot_creator = prot;
2282 
2283 	/* SANITY */
2284 	if (likely(newsk->sk_net_refcnt)) {
2285 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2286 		sock_inuse_add(sock_net(newsk), 1);
2287 	} else {
2288 		/* Kernel sockets are not elevating the struct net refcount.
2289 		 * Instead, use a tracker to more easily detect if a layer
2290 		 * is not properly dismantling its kernel sockets at netns
2291 		 * destroy time.
2292 		 */
2293 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2294 				      false, priority);
2295 	}
2296 	sk_node_init(&newsk->sk_node);
2297 	sock_lock_init(newsk);
2298 	bh_lock_sock(newsk);
2299 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2300 	newsk->sk_backlog.len = 0;
2301 
2302 	atomic_set(&newsk->sk_rmem_alloc, 0);
2303 
2304 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2305 	refcount_set(&newsk->sk_wmem_alloc, 1);
2306 
2307 	atomic_set(&newsk->sk_omem_alloc, 0);
2308 	sk_init_common(newsk);
2309 
2310 	newsk->sk_dst_cache	= NULL;
2311 	newsk->sk_dst_pending_confirm = 0;
2312 	newsk->sk_wmem_queued	= 0;
2313 	newsk->sk_forward_alloc = 0;
2314 	newsk->sk_reserved_mem  = 0;
2315 	atomic_set(&newsk->sk_drops, 0);
2316 	newsk->sk_send_head	= NULL;
2317 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2318 	atomic_set(&newsk->sk_zckey, 0);
2319 
2320 	sock_reset_flag(newsk, SOCK_DONE);
2321 
2322 	/* sk->sk_memcg will be populated at accept() time */
2323 	newsk->sk_memcg = NULL;
2324 
2325 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2326 
2327 	rcu_read_lock();
2328 	filter = rcu_dereference(sk->sk_filter);
2329 	if (filter != NULL)
2330 		/* though it's an empty new sock, the charging may fail
2331 		 * if sysctl_optmem_max was changed between creation of the
2332 		 * original socket and cloning
2333 		 */
2334 		is_charged = sk_filter_charge(newsk, filter);
2335 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2336 	rcu_read_unlock();
2337 
2338 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2339 		/* We need to make sure that we don't uncharge the new
2340 		 * socket if we couldn't charge it in the first place
2341 		 * as otherwise we uncharge the parent's filter.
2342 		 */
2343 		if (!is_charged)
2344 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2345 		sk_free_unlock_clone(newsk);
2346 		newsk = NULL;
2347 		goto out;
2348 	}
2349 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2350 
2351 	if (bpf_sk_storage_clone(sk, newsk)) {
2352 		sk_free_unlock_clone(newsk);
2353 		newsk = NULL;
2354 		goto out;
2355 	}
2356 
2357 	/* Clear sk_user_data if parent had the pointer tagged
2358 	 * as not suitable for copying when cloning.
2359 	 */
2360 	if (sk_user_data_is_nocopy(newsk))
2361 		newsk->sk_user_data = NULL;
2362 
2363 	newsk->sk_err	   = 0;
2364 	newsk->sk_err_soft = 0;
2365 	newsk->sk_priority = 0;
2366 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2367 
2368 	/* Before updating sk_refcnt, we must commit prior changes to memory
2369 	 * (Documentation/RCU/rculist_nulls.rst for details)
2370 	 */
2371 	smp_wmb();
2372 	refcount_set(&newsk->sk_refcnt, 2);
2373 
2374 	sk_set_socket(newsk, NULL);
2375 	sk_tx_queue_clear(newsk);
2376 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2377 
2378 	if (newsk->sk_prot->sockets_allocated)
2379 		sk_sockets_allocated_inc(newsk);
2380 
2381 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2382 		net_enable_timestamp();
2383 out:
2384 	return newsk;
2385 }
2386 EXPORT_SYMBOL_GPL(sk_clone_lock);
2387 
2388 void sk_free_unlock_clone(struct sock *sk)
2389 {
2390 	/* It is still a raw copy of the parent, so invalidate
2391 	 * its destructor and do a plain sk_free() */
2392 	sk->sk_destruct = NULL;
2393 	bh_unlock_sock(sk);
2394 	sk_free(sk);
2395 }
2396 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2397 
2398 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2399 {
2400 	bool is_ipv6 = false;
2401 	u32 max_size;
2402 
2403 #if IS_ENABLED(CONFIG_IPV6)
2404 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2405 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2406 #endif
2407 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2408 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2409 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2410 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2411 		max_size = GSO_LEGACY_MAX_SIZE;
2412 
2413 	return max_size - (MAX_TCP_HEADER + 1);
2414 }
2415 
2416 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2417 {
2418 	u32 max_segs = 1;
2419 
2420 	sk->sk_route_caps = dst->dev->features;
2421 	if (sk_is_tcp(sk))
2422 		sk->sk_route_caps |= NETIF_F_GSO;
2423 	if (sk->sk_route_caps & NETIF_F_GSO)
2424 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2425 	if (unlikely(sk->sk_gso_disabled))
2426 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2427 	if (sk_can_gso(sk)) {
2428 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2429 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2430 		} else {
2431 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2432 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2433 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2434 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2435 		}
2436 	}
2437 	sk->sk_gso_max_segs = max_segs;
2438 	sk_dst_set(sk, dst);
2439 }
2440 EXPORT_SYMBOL_GPL(sk_setup_caps);
2441 
2442 /*
2443  *	Simple resource managers for sockets.
2444  */
2445 
2446 
2447 /*
2448  * Write buffer destructor automatically called from kfree_skb.
2449  */
2450 void sock_wfree(struct sk_buff *skb)
2451 {
2452 	struct sock *sk = skb->sk;
2453 	unsigned int len = skb->truesize;
2454 	bool free;
2455 
2456 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2457 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2458 		    sk->sk_write_space == sock_def_write_space) {
2459 			rcu_read_lock();
2460 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2461 			sock_def_write_space_wfree(sk);
2462 			rcu_read_unlock();
2463 			if (unlikely(free))
2464 				__sk_free(sk);
2465 			return;
2466 		}
2467 
2468 		/*
2469 		 * Keep a reference on sk_wmem_alloc; it will be released
2470 		 * after the sk_write_space() call
2471 		 */
2472 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2473 		sk->sk_write_space(sk);
2474 		len = 1;
2475 	}
2476 	/*
2477 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2478 	 * could not do because of in-flight packets
2479 	 */
2480 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2481 		__sk_free(sk);
2482 }
2483 EXPORT_SYMBOL(sock_wfree);
2484 
2485 /* This variant of sock_wfree() is used by TCP,
2486  * since it sets SOCK_USE_WRITE_QUEUE.
2487  */
2488 void __sock_wfree(struct sk_buff *skb)
2489 {
2490 	struct sock *sk = skb->sk;
2491 
2492 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2493 		__sk_free(sk);
2494 }
2495 
2496 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2497 {
2498 	skb_orphan(skb);
2499 	skb->sk = sk;
2500 #ifdef CONFIG_INET
2501 	if (unlikely(!sk_fullsock(sk))) {
2502 		skb->destructor = sock_edemux;
2503 		sock_hold(sk);
2504 		return;
2505 	}
2506 #endif
2507 	skb->destructor = sock_wfree;
2508 	skb_set_hash_from_sk(skb, sk);
2509 	/*
2510 	 * We used to take a refcount on sk, but the following operation
2511 	 * is enough to guarantee sk_free() won't free this sock until
2512 	 * all in-flight packets are completed.
2513 	 */
2514 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2515 }
2516 EXPORT_SYMBOL(skb_set_owner_w);
2517 
2518 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2519 {
2520 #ifdef CONFIG_TLS_DEVICE
2521 	/* Drivers depend on in-order delivery for crypto offload,
2522 	 * partial orphan breaks out-of-order-OK logic.
2523 	 */
2524 	if (skb->decrypted)
2525 		return false;
2526 #endif
2527 	return (skb->destructor == sock_wfree ||
2528 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2529 }
2530 
2531 /* This helper is used by netem, as it can hold packets in its
2532  * delay queue. We want to allow the owner socket to send more
2533  * packets, as if they were already TX completed by a typical driver.
2534  * But we also want to keep skb->sk set because some packet schedulers
2535  * rely on it (sch_fq for example).
2536  */
2537 void skb_orphan_partial(struct sk_buff *skb)
2538 {
2539 	if (skb_is_tcp_pure_ack(skb))
2540 		return;
2541 
2542 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2543 		return;
2544 
2545 	skb_orphan(skb);
2546 }
2547 EXPORT_SYMBOL(skb_orphan_partial);
2548 
2549 /*
2550  * Read buffer destructor automatically called from kfree_skb.
2551  */
2552 void sock_rfree(struct sk_buff *skb)
2553 {
2554 	struct sock *sk = skb->sk;
2555 	unsigned int len = skb->truesize;
2556 
2557 	atomic_sub(len, &sk->sk_rmem_alloc);
2558 	sk_mem_uncharge(sk, len);
2559 }
2560 EXPORT_SYMBOL(sock_rfree);
2561 
2562 /*
2563  * Buffer destructor for skbs that are not used directly in read or write
2564  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2565  */
2566 void sock_efree(struct sk_buff *skb)
2567 {
2568 	sock_put(skb->sk);
2569 }
2570 EXPORT_SYMBOL(sock_efree);
2571 
2572 /* Buffer destructor for prefetch/receive path where reference count may
2573  * not be held, e.g. for listen sockets.
2574  */
2575 #ifdef CONFIG_INET
2576 void sock_pfree(struct sk_buff *skb)
2577 {
2578 	if (sk_is_refcounted(skb->sk))
2579 		sock_gen_put(skb->sk);
2580 }
2581 EXPORT_SYMBOL(sock_pfree);
2582 #endif /* CONFIG_INET */
2583 
2584 kuid_t sock_i_uid(struct sock *sk)
2585 {
2586 	kuid_t uid;
2587 
2588 	read_lock_bh(&sk->sk_callback_lock);
2589 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2590 	read_unlock_bh(&sk->sk_callback_lock);
2591 	return uid;
2592 }
2593 EXPORT_SYMBOL(sock_i_uid);
2594 
2595 unsigned long __sock_i_ino(struct sock *sk)
2596 {
2597 	unsigned long ino;
2598 
2599 	read_lock(&sk->sk_callback_lock);
2600 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2601 	read_unlock(&sk->sk_callback_lock);
2602 	return ino;
2603 }
2604 EXPORT_SYMBOL(__sock_i_ino);
2605 
2606 unsigned long sock_i_ino(struct sock *sk)
2607 {
2608 	unsigned long ino;
2609 
2610 	local_bh_disable();
2611 	ino = __sock_i_ino(sk);
2612 	local_bh_enable();
2613 	return ino;
2614 }
2615 EXPORT_SYMBOL(sock_i_ino);
2616 
2617 /*
2618  * Allocate a skb from the socket's send buffer.
2619  */
2620 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2621 			     gfp_t priority)
2622 {
2623 	if (force ||
2624 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2625 		struct sk_buff *skb = alloc_skb(size, priority);
2626 
2627 		if (skb) {
2628 			skb_set_owner_w(skb, sk);
2629 			return skb;
2630 		}
2631 	}
2632 	return NULL;
2633 }
2634 EXPORT_SYMBOL(sock_wmalloc);
2635 
2636 static void sock_ofree(struct sk_buff *skb)
2637 {
2638 	struct sock *sk = skb->sk;
2639 
2640 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2641 }
2642 
2643 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2644 			     gfp_t priority)
2645 {
2646 	struct sk_buff *skb;
2647 
2648 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2649 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2650 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2651 		return NULL;
2652 
2653 	skb = alloc_skb(size, priority);
2654 	if (!skb)
2655 		return NULL;
2656 
2657 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2658 	skb->sk = sk;
2659 	skb->destructor = sock_ofree;
2660 	return skb;
2661 }
2662 
2663 /*
2664  * Allocate a memory block from the socket's option memory buffer.
2665  */
2666 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2667 {
2668 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2669 
2670 	if ((unsigned int)size <= optmem_max &&
2671 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2672 		void *mem;
2673 		/* First do the add, to avoid the race if kmalloc
2674 		 * might sleep.
2675 		 */
2676 		atomic_add(size, &sk->sk_omem_alloc);
2677 		mem = kmalloc(size, priority);
2678 		if (mem)
2679 			return mem;
2680 		atomic_sub(size, &sk->sk_omem_alloc);
2681 	}
2682 	return NULL;
2683 }
2684 EXPORT_SYMBOL(sock_kmalloc);
2685 
2686 /* Free an option memory block. Note, we actually want the inline
2687  * here as this allows gcc to detect the nullify and fold away the
2688  * condition entirely.
2689  */
2690 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2691 				  const bool nullify)
2692 {
2693 	if (WARN_ON_ONCE(!mem))
2694 		return;
2695 	if (nullify)
2696 		kfree_sensitive(mem);
2697 	else
2698 		kfree(mem);
2699 	atomic_sub(size, &sk->sk_omem_alloc);
2700 }
2701 
2702 void sock_kfree_s(struct sock *sk, void *mem, int size)
2703 {
2704 	__sock_kfree_s(sk, mem, size, false);
2705 }
2706 EXPORT_SYMBOL(sock_kfree_s);
2707 
2708 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2709 {
2710 	__sock_kfree_s(sk, mem, size, true);
2711 }
2712 EXPORT_SYMBOL(sock_kzfree_s);
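
/* Sketch of the expected pairing in a (hypothetical) setsockopt() handler;
 * the same size must be passed back so sk_omem_alloc stays balanced:
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);	(or sock_kzfree_s() for key material)
 */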
2713 
2714 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2715    I think these locks should be removed for datagram sockets.
2716  */
2717 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2718 {
2719 	DEFINE_WAIT(wait);
2720 
2721 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2722 	for (;;) {
2723 		if (!timeo)
2724 			break;
2725 		if (signal_pending(current))
2726 			break;
2727 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2728 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2729 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2730 			break;
2731 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2732 			break;
2733 		if (READ_ONCE(sk->sk_err))
2734 			break;
2735 		timeo = schedule_timeout(timeo);
2736 	}
2737 	finish_wait(sk_sleep(sk), &wait);
2738 	return timeo;
2739 }
2740 
2741 
2742 /*
2743  *	Generic send/receive buffer handlers
2744  */
2745 
2746 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2747 				     unsigned long data_len, int noblock,
2748 				     int *errcode, int max_page_order)
2749 {
2750 	struct sk_buff *skb;
2751 	long timeo;
2752 	int err;
2753 
2754 	timeo = sock_sndtimeo(sk, noblock);
2755 	for (;;) {
2756 		err = sock_error(sk);
2757 		if (err != 0)
2758 			goto failure;
2759 
2760 		err = -EPIPE;
2761 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2762 			goto failure;
2763 
2764 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2765 			break;
2766 
2767 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2768 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2769 		err = -EAGAIN;
2770 		if (!timeo)
2771 			goto failure;
2772 		if (signal_pending(current))
2773 			goto interrupted;
2774 		timeo = sock_wait_for_wmem(sk, timeo);
2775 	}
2776 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2777 				   errcode, sk->sk_allocation);
2778 	if (skb)
2779 		skb_set_owner_w(skb, sk);
2780 	return skb;
2781 
2782 interrupted:
2783 	err = sock_intr_errno(timeo);
2784 failure:
2785 	*errcode = err;
2786 	return NULL;
2787 }
2788 EXPORT_SYMBOL(sock_alloc_send_pskb);
2789 
2790 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2791 		     struct sockcm_cookie *sockc)
2792 {
2793 	u32 tsflags;
2794 
2795 	switch (cmsg->cmsg_type) {
2796 	case SO_MARK:
2797 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2798 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2799 			return -EPERM;
2800 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2801 			return -EINVAL;
2802 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2803 		break;
2804 	case SO_TIMESTAMPING_OLD:
2805 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2806 			return -EINVAL;
2807 
2808 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2809 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2810 			return -EINVAL;
2811 
2812 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2813 		sockc->tsflags |= tsflags;
2814 		break;
2815 	case SCM_TXTIME:
2816 		if (!sock_flag(sk, SOCK_TXTIME))
2817 			return -EINVAL;
2818 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2819 			return -EINVAL;
2820 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2821 		break;
2822 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2823 	case SCM_RIGHTS:
2824 	case SCM_CREDENTIALS:
2825 		break;
2826 	default:
2827 		return -EINVAL;
2828 	}
2829 	return 0;
2830 }
2831 EXPORT_SYMBOL(__sock_cmsg_send);
2832 
2833 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2834 		   struct sockcm_cookie *sockc)
2835 {
2836 	struct cmsghdr *cmsg;
2837 	int ret;
2838 
2839 	for_each_cmsghdr(cmsg, msg) {
2840 		if (!CMSG_OK(msg, cmsg))
2841 			return -EINVAL;
2842 		if (cmsg->cmsg_level != SOL_SOCKET)
2843 			continue;
2844 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2845 		if (ret)
2846 			return ret;
2847 	}
2848 	return 0;
2849 }
2850 EXPORT_SYMBOL(sock_cmsg_send);
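
/* Rough sketch of how a sendmsg() implementation is expected to consume
 * SOL_SOCKET control messages, assuming a sockcm_cookie initialised with
 * sockcm_init():
 *
 *	struct sockcm_cookie sockc;
 *
 *	sockcm_init(&sockc, sk);
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */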
2851 
2852 static void sk_enter_memory_pressure(struct sock *sk)
2853 {
2854 	if (!sk->sk_prot->enter_memory_pressure)
2855 		return;
2856 
2857 	sk->sk_prot->enter_memory_pressure(sk);
2858 }
2859 
2860 static void sk_leave_memory_pressure(struct sock *sk)
2861 {
2862 	if (sk->sk_prot->leave_memory_pressure) {
2863 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2864 				     tcp_leave_memory_pressure, sk);
2865 	} else {
2866 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2867 
2868 		if (memory_pressure && READ_ONCE(*memory_pressure))
2869 			WRITE_ONCE(*memory_pressure, 0);
2870 	}
2871 }
2872 
2873 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2874 
2875 /**
2876  * skb_page_frag_refill - check that a page_frag contains enough room
2877  * @sz: minimum size of the fragment we want to get
2878  * @pfrag: pointer to page_frag
2879  * @gfp: priority for memory allocation
2880  *
2881  * Note: While this allocator tries to use high order pages, there is
2882  * no guarantee that allocations succeed. Therefore, @sz MUST be
2883  * less than or equal to PAGE_SIZE.
2884  */
2885 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2886 {
2887 	if (pfrag->page) {
2888 		if (page_ref_count(pfrag->page) == 1) {
2889 			pfrag->offset = 0;
2890 			return true;
2891 		}
2892 		if (pfrag->offset + sz <= pfrag->size)
2893 			return true;
2894 		put_page(pfrag->page);
2895 	}
2896 
2897 	pfrag->offset = 0;
2898 	if (SKB_FRAG_PAGE_ORDER &&
2899 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2900 		/* Avoid direct reclaim but allow kswapd to wake */
2901 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2902 					  __GFP_COMP | __GFP_NOWARN |
2903 					  __GFP_NORETRY,
2904 					  SKB_FRAG_PAGE_ORDER);
2905 		if (likely(pfrag->page)) {
2906 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2907 			return true;
2908 		}
2909 	}
2910 	pfrag->page = alloc_page(gfp);
2911 	if (likely(pfrag->page)) {
2912 		pfrag->size = PAGE_SIZE;
2913 		return true;
2914 	}
2915 	return false;
2916 }
2917 EXPORT_SYMBOL(skb_page_frag_refill);
2918 
2919 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2920 {
2921 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2922 		return true;
2923 
2924 	sk_enter_memory_pressure(sk);
2925 	sk_stream_moderate_sndbuf(sk);
2926 	return false;
2927 }
2928 EXPORT_SYMBOL(sk_page_frag_refill);
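
/* Typical (simplified) pattern in a protocol's sendmsg path: refill the
 * per-socket fragment, copy into it, then advance the offset by what was
 * actually consumed:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, size, pfrag->size - pfrag->offset);
 *	(copy data into pfrag->page at pfrag->offset)
 *	pfrag->offset += copy;
 */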
2929 
2930 void __lock_sock(struct sock *sk)
2931 	__releases(&sk->sk_lock.slock)
2932 	__acquires(&sk->sk_lock.slock)
2933 {
2934 	DEFINE_WAIT(wait);
2935 
2936 	for (;;) {
2937 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2938 					TASK_UNINTERRUPTIBLE);
2939 		spin_unlock_bh(&sk->sk_lock.slock);
2940 		schedule();
2941 		spin_lock_bh(&sk->sk_lock.slock);
2942 		if (!sock_owned_by_user(sk))
2943 			break;
2944 	}
2945 	finish_wait(&sk->sk_lock.wq, &wait);
2946 }
2947 
2948 void __release_sock(struct sock *sk)
2949 	__releases(&sk->sk_lock.slock)
2950 	__acquires(&sk->sk_lock.slock)
2951 {
2952 	struct sk_buff *skb, *next;
2953 
2954 	while ((skb = sk->sk_backlog.head) != NULL) {
2955 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2956 
2957 		spin_unlock_bh(&sk->sk_lock.slock);
2958 
2959 		do {
2960 			next = skb->next;
2961 			prefetch(next);
2962 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2963 			skb_mark_not_on_list(skb);
2964 			sk_backlog_rcv(sk, skb);
2965 
2966 			cond_resched();
2967 
2968 			skb = next;
2969 		} while (skb != NULL);
2970 
2971 		spin_lock_bh(&sk->sk_lock.slock);
2972 	}
2973 
2974 	/*
2975 	 * Doing the zeroing here guarantees we cannot loop forever
2976 	 * while a wild producer attempts to flood us.
2977 	 */
2978 	sk->sk_backlog.len = 0;
2979 }
2980 
2981 void __sk_flush_backlog(struct sock *sk)
2982 {
2983 	spin_lock_bh(&sk->sk_lock.slock);
2984 	__release_sock(sk);
2985 
2986 	if (sk->sk_prot->release_cb)
2987 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
2988 				     tcp_release_cb, sk);
2989 
2990 	spin_unlock_bh(&sk->sk_lock.slock);
2991 }
2992 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
2993 
2994 /**
2995  * sk_wait_data - wait for data to arrive at sk_receive_queue
2996  * @sk:    sock to wait on
2997  * @timeo: for how long
2998  * @skb:   last skb seen on sk_receive_queue
2999  *
3000  * Now socket state including sk->sk_err is changed only under lock,
3001  * hence we may omit checks after joining wait queue.
3002  * We check receive queue before schedule() only as optimization;
3003  * it is very likely that release_sock() added new data.
3004  */
3005 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3006 {
3007 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3008 	int rc;
3009 
3010 	add_wait_queue(sk_sleep(sk), &wait);
3011 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3012 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3013 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3014 	remove_wait_queue(sk_sleep(sk), &wait);
3015 	return rc;
3016 }
3017 EXPORT_SYMBOL(sk_wait_data);
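
/* Sketch of a blocking receive loop built on this helper (details vary per
 * protocol); the caller holds the socket lock, which sk_wait_event() drops
 * and re-acquires around the wait:
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */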
3018 
3019 /**
3020  *	__sk_mem_raise_allocated - increase memory_allocated
3021  *	@sk: socket
3022  *	@size: memory size to allocate
3023  *	@amt: pages to allocate
3024  *	@kind: allocation type
3025  *
3026  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3027  *
3028  *	Unlike the globally shared limits among the sockets under the same protocol,
3029  *	consuming the budget of a memcg won't have a direct effect on other ones.
3030  *	So be optimistic about the memcg's tolerance, and leave it to the callers to decide
3031  *	whether or not to raise allocated through sk_under_memory_pressure() or
3032  *	its variants.
3033  */
3034 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3035 {
3036 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3037 	struct proto *prot = sk->sk_prot;
3038 	bool charged = false;
3039 	long allocated;
3040 
3041 	sk_memory_allocated_add(sk, amt);
3042 	allocated = sk_memory_allocated(sk);
3043 
3044 	if (memcg) {
3045 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3046 			goto suppress_allocation;
3047 		charged = true;
3048 	}
3049 
3050 	/* Under limit. */
3051 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3052 		sk_leave_memory_pressure(sk);
3053 		return 1;
3054 	}
3055 
3056 	/* Under pressure. */
3057 	if (allocated > sk_prot_mem_limits(sk, 1))
3058 		sk_enter_memory_pressure(sk);
3059 
3060 	/* Over hard limit. */
3061 	if (allocated > sk_prot_mem_limits(sk, 2))
3062 		goto suppress_allocation;
3063 
3064 	/* Guarantee minimum buffer size under pressure (either global
3065 	 * or memcg) to make sure features described in RFC 7323 (TCP
3066 	 * Extensions for High Performance) work properly.
3067 	 *
3068 	 * This rule does NOT stand when the usage exceeds the global or memcg's
3069 	 * hard limit, or else a DoS attack could take place by spawning lots of
3070 	 * sockets whose usage is under the minimum buffer size.
3071 	 */
3072 	if (kind == SK_MEM_RECV) {
3073 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3074 			return 1;
3075 
3076 	} else { /* SK_MEM_SEND */
3077 		int wmem0 = sk_get_wmem0(sk, prot);
3078 
3079 		if (sk->sk_type == SOCK_STREAM) {
3080 			if (sk->sk_wmem_queued < wmem0)
3081 				return 1;
3082 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3083 			return 1;
3084 		}
3085 	}
3086 
3087 	if (sk_has_memory_pressure(sk)) {
3088 		u64 alloc;
3089 
3090 		/* The following 'average' heuristic is within the
3091 		 * scope of global accounting, so it only makes
3092 		 * sense for global memory pressure.
3093 		 */
3094 		if (!sk_under_global_memory_pressure(sk))
3095 			return 1;
3096 
3097 		/* Try to be fair among all the sockets under global
3098 		 * pressure by allowing the ones with below-average
3099 		 * usage to raise.
3100 		 */
3101 		alloc = sk_sockets_allocated_read_positive(sk);
3102 		if (sk_prot_mem_limits(sk, 2) > alloc *
3103 		    sk_mem_pages(sk->sk_wmem_queued +
3104 				 atomic_read(&sk->sk_rmem_alloc) +
3105 				 sk->sk_forward_alloc))
3106 			return 1;
3107 	}
3108 
3109 suppress_allocation:
3110 
3111 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3112 		sk_stream_moderate_sndbuf(sk);
3113 
3114 		/* Fail only if socket is _under_ its sndbuf.
3115 		 * In this case we cannot block, so we have to fail.
3116 		 */
3117 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3118 			/* Force charge with __GFP_NOFAIL */
3119 			if (memcg && !charged) {
3120 				mem_cgroup_charge_skmem(memcg, amt,
3121 					gfp_memcg_charge() | __GFP_NOFAIL);
3122 			}
3123 			return 1;
3124 		}
3125 	}
3126 
3127 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3128 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3129 
3130 	sk_memory_allocated_sub(sk, amt);
3131 
3132 	if (charged)
3133 		mem_cgroup_uncharge_skmem(memcg, amt);
3134 
3135 	return 0;
3136 }
3137 
3138 /**
3139  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3140  *	@sk: socket
3141  *	@size: memory size to allocate
3142  *	@kind: allocation type
3143  *
3144  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3145  *	rmem allocation. This function assumes that protocols which have
3146  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3147  */
3148 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3149 {
3150 	int ret, amt = sk_mem_pages(size);
3151 
3152 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3153 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3154 	if (!ret)
3155 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3156 	return ret;
3157 }
3158 EXPORT_SYMBOL(__sk_mem_schedule);
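
/* Accounting sketch: charging happens in PAGE_SIZE quanta. For example, a
 * receive path queueing an skb of truesize 3000 with no forward allocation
 * left needs sk_mem_pages(3000) == 1 extra page (with 4K pages):
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);	(charges sk_rmem_alloc and consumes
 *					 the freshly scheduled forward alloc)
 */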
3159 
3160 /**
3161  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3162  *	@sk: socket
3163  *	@amount: number of quanta
3164  *
3165  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3166  */
3167 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3168 {
3169 	sk_memory_allocated_sub(sk, amount);
3170 
3171 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3172 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3173 
3174 	if (sk_under_global_memory_pressure(sk) &&
3175 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3176 		sk_leave_memory_pressure(sk);
3177 }
3178 
3179 /**
3180  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3181  *	@sk: socket
3182  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3183  */
3184 void __sk_mem_reclaim(struct sock *sk, int amount)
3185 {
3186 	amount >>= PAGE_SHIFT;
3187 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3188 	__sk_mem_reduce_allocated(sk, amount);
3189 }
3190 EXPORT_SYMBOL(__sk_mem_reclaim);
3191 
3192 int sk_set_peek_off(struct sock *sk, int val)
3193 {
3194 	WRITE_ONCE(sk->sk_peek_off, val);
3195 	return 0;
3196 }
3197 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3198 
3199 /*
3200  * Set of default routines for initialising struct proto_ops when
3201  * the protocol does not support a particular function. In certain
3202  * cases where it makes no sense for a protocol to have a "do nothing"
3203  * function, some default processing is provided.
3204  */
3205 
3206 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3207 {
3208 	return -EOPNOTSUPP;
3209 }
3210 EXPORT_SYMBOL(sock_no_bind);
3211 
3212 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3213 		    int len, int flags)
3214 {
3215 	return -EOPNOTSUPP;
3216 }
3217 EXPORT_SYMBOL(sock_no_connect);
3218 
3219 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3220 {
3221 	return -EOPNOTSUPP;
3222 }
3223 EXPORT_SYMBOL(sock_no_socketpair);
3224 
3225 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3226 		   bool kern)
3227 {
3228 	return -EOPNOTSUPP;
3229 }
3230 EXPORT_SYMBOL(sock_no_accept);
3231 
3232 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3233 		    int peer)
3234 {
3235 	return -EOPNOTSUPP;
3236 }
3237 EXPORT_SYMBOL(sock_no_getname);
3238 
3239 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3240 {
3241 	return -EOPNOTSUPP;
3242 }
3243 EXPORT_SYMBOL(sock_no_ioctl);
3244 
3245 int sock_no_listen(struct socket *sock, int backlog)
3246 {
3247 	return -EOPNOTSUPP;
3248 }
3249 EXPORT_SYMBOL(sock_no_listen);
3250 
3251 int sock_no_shutdown(struct socket *sock, int how)
3252 {
3253 	return -EOPNOTSUPP;
3254 }
3255 EXPORT_SYMBOL(sock_no_shutdown);
3256 
3257 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3258 {
3259 	return -EOPNOTSUPP;
3260 }
3261 EXPORT_SYMBOL(sock_no_sendmsg);
3262 
3263 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3264 {
3265 	return -EOPNOTSUPP;
3266 }
3267 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3268 
3269 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3270 		    int flags)
3271 {
3272 	return -EOPNOTSUPP;
3273 }
3274 EXPORT_SYMBOL(sock_no_recvmsg);
3275 
3276 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3277 {
3278 	/* Mirror missing mmap method error code */
3279 	return -ENODEV;
3280 }
3281 EXPORT_SYMBOL(sock_no_mmap);
3282 
3283 /*
3284  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3285  * various sock-based usage counts.
3286  */
3287 void __receive_sock(struct file *file)
3288 {
3289 	struct socket *sock;
3290 
3291 	sock = sock_from_file(file);
3292 	if (sock) {
3293 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3294 		sock_update_classid(&sock->sk->sk_cgrp_data);
3295 	}
3296 }
3297 
3298 /*
3299  *	Default Socket Callbacks
3300  */
3301 
3302 static void sock_def_wakeup(struct sock *sk)
3303 {
3304 	struct socket_wq *wq;
3305 
3306 	rcu_read_lock();
3307 	wq = rcu_dereference(sk->sk_wq);
3308 	if (skwq_has_sleeper(wq))
3309 		wake_up_interruptible_all(&wq->wait);
3310 	rcu_read_unlock();
3311 }
3312 
3313 static void sock_def_error_report(struct sock *sk)
3314 {
3315 	struct socket_wq *wq;
3316 
3317 	rcu_read_lock();
3318 	wq = rcu_dereference(sk->sk_wq);
3319 	if (skwq_has_sleeper(wq))
3320 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3321 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3322 	rcu_read_unlock();
3323 }
3324 
3325 void sock_def_readable(struct sock *sk)
3326 {
3327 	struct socket_wq *wq;
3328 
3329 	trace_sk_data_ready(sk);
3330 
3331 	rcu_read_lock();
3332 	wq = rcu_dereference(sk->sk_wq);
3333 	if (skwq_has_sleeper(wq))
3334 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3335 						EPOLLRDNORM | EPOLLRDBAND);
3336 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3337 	rcu_read_unlock();
3338 }
3339 
3340 static void sock_def_write_space(struct sock *sk)
3341 {
3342 	struct socket_wq *wq;
3343 
3344 	rcu_read_lock();
3345 
3346 	/* Do not wake up a writer until he can make "significant"
3347 	 * progress.  --DaveM
3348 	 */
3349 	if (sock_writeable(sk)) {
3350 		wq = rcu_dereference(sk->sk_wq);
3351 		if (skwq_has_sleeper(wq))
3352 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3353 						EPOLLWRNORM | EPOLLWRBAND);
3354 
3355 		/* Should agree with poll, otherwise some programs break */
3356 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3357 	}
3358 
3359 	rcu_read_unlock();
3360 }
3361 
3362 /* An optimised version of sock_def_write_space(); it should only be called
3363  * for SOCK_RCU_FREE sockets, under an RCU read-side section, and after putting
3364  * ->sk_wmem_alloc.
3365  */
3366 static void sock_def_write_space_wfree(struct sock *sk)
3367 {
3368 	/* Do not wake up a writer until he can make "significant"
3369 	 * progress.  --DaveM
3370 	 */
3371 	if (sock_writeable(sk)) {
3372 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3373 
3374 		/* rely on refcount_sub from sock_wfree() */
3375 		smp_mb__after_atomic();
3376 		if (wq && waitqueue_active(&wq->wait))
3377 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3378 						EPOLLWRNORM | EPOLLWRBAND);
3379 
3380 		/* Should agree with poll, otherwise some programs break */
3381 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3382 	}
3383 }
3384 
3385 static void sock_def_destruct(struct sock *sk)
3386 {
3387 }
3388 
3389 void sk_send_sigurg(struct sock *sk)
3390 {
3391 	if (sk->sk_socket && sk->sk_socket->file)
3392 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3393 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3394 }
3395 EXPORT_SYMBOL(sk_send_sigurg);
3396 
3397 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3398 		    unsigned long expires)
3399 {
3400 	if (!mod_timer(timer, expires))
3401 		sock_hold(sk);
3402 }
3403 EXPORT_SYMBOL(sk_reset_timer);
3404 
3405 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3406 {
3407 	if (del_timer(timer))
3408 		__sock_put(sk);
3409 }
3410 EXPORT_SYMBOL(sk_stop_timer);
3411 
3412 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3413 {
3414 	if (del_timer_sync(timer))
3415 		__sock_put(sk);
3416 }
3417 EXPORT_SYMBOL(sk_stop_timer_sync);
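
/* These wrappers keep the sock refcount in step with the timer state, e.g.
 * for a (hypothetical) retransmit-style timer:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);	(sock_hold() when
 *								 newly armed)
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	(__sock_put() if it was pending)
 */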
3418 
3419 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3420 {
3421 	sk_init_common(sk);
3422 	sk->sk_send_head	=	NULL;
3423 
3424 	timer_setup(&sk->sk_timer, NULL, 0);
3425 
3426 	sk->sk_allocation	=	GFP_KERNEL;
3427 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3428 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3429 	sk->sk_state		=	TCP_CLOSE;
3430 	sk->sk_use_task_frag	=	true;
3431 	sk_set_socket(sk, sock);
3432 
3433 	sock_set_flag(sk, SOCK_ZAPPED);
3434 
3435 	if (sock) {
3436 		sk->sk_type	=	sock->type;
3437 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3438 		sock->sk	=	sk;
3439 	} else {
3440 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3441 	}
3442 	sk->sk_uid	=	uid;
3443 
3444 	rwlock_init(&sk->sk_callback_lock);
3445 	if (sk->sk_kern_sock)
3446 		lockdep_set_class_and_name(
3447 			&sk->sk_callback_lock,
3448 			af_kern_callback_keys + sk->sk_family,
3449 			af_family_kern_clock_key_strings[sk->sk_family]);
3450 	else
3451 		lockdep_set_class_and_name(
3452 			&sk->sk_callback_lock,
3453 			af_callback_keys + sk->sk_family,
3454 			af_family_clock_key_strings[sk->sk_family]);
3455 
3456 	sk->sk_state_change	=	sock_def_wakeup;
3457 	sk->sk_data_ready	=	sock_def_readable;
3458 	sk->sk_write_space	=	sock_def_write_space;
3459 	sk->sk_error_report	=	sock_def_error_report;
3460 	sk->sk_destruct		=	sock_def_destruct;
3461 
3462 	sk->sk_frag.page	=	NULL;
3463 	sk->sk_frag.offset	=	0;
3464 	sk->sk_peek_off		=	-1;
3465 
3466 	sk->sk_peer_pid 	=	NULL;
3467 	sk->sk_peer_cred	=	NULL;
3468 	spin_lock_init(&sk->sk_peer_lock);
3469 
3470 	sk->sk_write_pending	=	0;
3471 	sk->sk_rcvlowat		=	1;
3472 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3473 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3474 
3475 	sk->sk_stamp = SK_DEFAULT_STAMP;
3476 #if BITS_PER_LONG==32
3477 	seqlock_init(&sk->sk_stamp_seq);
3478 #endif
3479 	atomic_set(&sk->sk_zckey, 0);
3480 
3481 #ifdef CONFIG_NET_RX_BUSY_POLL
3482 	sk->sk_napi_id		=	0;
3483 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3484 #endif
3485 
3486 	sk->sk_max_pacing_rate = ~0UL;
3487 	sk->sk_pacing_rate = ~0UL;
3488 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3489 	sk->sk_incoming_cpu = -1;
3490 
3491 	sk_rx_queue_clear(sk);
3492 	/*
3493 	 * Before updating sk_refcnt, we must commit prior changes to memory
3494 	 * (Documentation/RCU/rculist_nulls.rst for details)
3495 	 */
3496 	smp_wmb();
3497 	refcount_set(&sk->sk_refcnt, 1);
3498 	atomic_set(&sk->sk_drops, 0);
3499 }
3500 EXPORT_SYMBOL(sock_init_data_uid);
3501 
3502 void sock_init_data(struct socket *sock, struct sock *sk)
3503 {
3504 	kuid_t uid = sock ?
3505 		SOCK_INODE(sock)->i_uid :
3506 		make_kuid(sock_net(sk)->user_ns, 0);
3507 
3508 	sock_init_data_uid(sock, sk, uid);
3509 }
3510 EXPORT_SYMBOL(sock_init_data);
3511 
3512 void lock_sock_nested(struct sock *sk, int subclass)
3513 {
3514 	/* The sk_lock has mutex_lock() semantics here. */
3515 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3516 
3517 	might_sleep();
3518 	spin_lock_bh(&sk->sk_lock.slock);
3519 	if (sock_owned_by_user_nocheck(sk))
3520 		__lock_sock(sk);
3521 	sk->sk_lock.owned = 1;
3522 	spin_unlock_bh(&sk->sk_lock.slock);
3523 }
3524 EXPORT_SYMBOL(lock_sock_nested);
3525 
3526 void release_sock(struct sock *sk)
3527 {
3528 	spin_lock_bh(&sk->sk_lock.slock);
3529 	if (sk->sk_backlog.tail)
3530 		__release_sock(sk);
3531 
3532 	if (sk->sk_prot->release_cb)
3533 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3534 				     tcp_release_cb, sk);
3535 
3536 	sock_release_ownership(sk);
3537 	if (waitqueue_active(&sk->sk_lock.wq))
3538 		wake_up(&sk->sk_lock.wq);
3539 	spin_unlock_bh(&sk->sk_lock.slock);
3540 }
3541 EXPORT_SYMBOL(release_sock);
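
/* Canonical use of the socket "mutex": process context brackets its work
 * with lock_sock()/release_sock(), while softirq context that finds the
 * lock owned queues packets to the backlog replayed by __release_sock():
 *
 *	lock_sock(sk);
 *	(modify protocol state, walk queues, ...)
 *	release_sock(sk);
 */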
3542 
3543 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3544 {
3545 	might_sleep();
3546 	spin_lock_bh(&sk->sk_lock.slock);
3547 
3548 	if (!sock_owned_by_user_nocheck(sk)) {
3549 		/*
3550 		 * Fast path return with bottom halves disabled and
3551 		 * sock::sk_lock.slock held.
3552 		 *
3553 		 * The 'mutex' is not contended and holding
3554 		 * sock::sk_lock.slock prevents all other lockers from
3555 		 * proceeding, so the corresponding unlock_sock_fast() can
3556 		 * avoid the slow path of release_sock() completely and
3557 		 * just release slock.
3558 		 *
3559 		 * From a semantic POV this is equivalent to 'acquiring'
3560 		 * the 'mutex', hence the corresponding lockdep
3561 		 * mutex_release() has to happen in the fast path of
3562 		 * unlock_sock_fast().
3563 		 */
3564 		return false;
3565 	}
3566 
3567 	__lock_sock(sk);
3568 	sk->sk_lock.owned = 1;
3569 	__acquire(&sk->sk_lock.slock);
3570 	spin_unlock_bh(&sk->sk_lock.slock);
3571 	return true;
3572 }
3573 EXPORT_SYMBOL(__lock_sock_fast);
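
/* Callers use this through lock_sock_fast()/unlock_sock_fast(), e.g. for a
 * short critical section (sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	(short critical section)
 *	unlock_sock_fast(sk, slow);
 */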
3574 
3575 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3576 		   bool timeval, bool time32)
3577 {
3578 	struct sock *sk = sock->sk;
3579 	struct timespec64 ts;
3580 
3581 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3582 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3583 	if (ts.tv_sec == -1)
3584 		return -ENOENT;
3585 	if (ts.tv_sec == 0) {
3586 		ktime_t kt = ktime_get_real();
3587 		sock_write_timestamp(sk, kt);
3588 		ts = ktime_to_timespec64(kt);
3589 	}
3590 
3591 	if (timeval)
3592 		ts.tv_nsec /= 1000;
3593 
3594 #ifdef CONFIG_COMPAT_32BIT_TIME
3595 	if (time32)
3596 		return put_old_timespec32(&ts, userstamp);
3597 #endif
3598 #ifdef CONFIG_SPARC64
3599 	/* beware of padding in sparc64 timeval */
3600 	if (timeval && !in_compat_syscall()) {
3601 		struct __kernel_old_timeval __user tv = {
3602 			.tv_sec = ts.tv_sec,
3603 			.tv_usec = ts.tv_nsec,
3604 		};
3605 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3606 			return -EFAULT;
3607 		return 0;
3608 	}
3609 #endif
3610 	return put_timespec64(&ts, userstamp);
3611 }
3612 EXPORT_SYMBOL(sock_gettstamp);
3613 
3614 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3615 {
3616 	if (!sock_flag(sk, flag)) {
3617 		unsigned long previous_flags = sk->sk_flags;
3618 
3619 		sock_set_flag(sk, flag);
3620 		/*
3621 		 * we just set one of the two flags which require net
3622 		 * time stamping, but time stamping might have been on
3623 		 * already because of the other one
3624 		 */
3625 		if (sock_needs_netstamp(sk) &&
3626 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3627 			net_enable_timestamp();
3628 	}
3629 }
3630 
3631 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3632 		       int level, int type)
3633 {
3634 	struct sock_exterr_skb *serr;
3635 	struct sk_buff *skb;
3636 	int copied, err;
3637 
3638 	err = -EAGAIN;
3639 	skb = sock_dequeue_err_skb(sk);
3640 	if (skb == NULL)
3641 		goto out;
3642 
3643 	copied = skb->len;
3644 	if (copied > len) {
3645 		msg->msg_flags |= MSG_TRUNC;
3646 		copied = len;
3647 	}
3648 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3649 	if (err)
3650 		goto out_free_skb;
3651 
3652 	sock_recv_timestamp(msg, sk, skb);
3653 
3654 	serr = SKB_EXT_ERR(skb);
3655 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3656 
3657 	msg->msg_flags |= MSG_ERRQUEUE;
3658 	err = copied;
3659 
3660 out_free_skb:
3661 	kfree_skb(skb);
3662 out:
3663 	return err;
3664 }
3665 EXPORT_SYMBOL(sock_recv_errqueue);
3666 
3667 /*
3668  *	Get a socket option on a socket.
3669  *
3670  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3671  *	asynchronous errors should be reported by getsockopt. We assume
3672  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3673  */
3674 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3675 			   char __user *optval, int __user *optlen)
3676 {
3677 	struct sock *sk = sock->sk;
3678 
3679 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3680 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3681 }
3682 EXPORT_SYMBOL(sock_common_getsockopt);
3683 
3684 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3685 			int flags)
3686 {
3687 	struct sock *sk = sock->sk;
3688 	int addr_len = 0;
3689 	int err;
3690 
3691 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3692 	if (err >= 0)
3693 		msg->msg_namelen = addr_len;
3694 	return err;
3695 }
3696 EXPORT_SYMBOL(sock_common_recvmsg);
3697 
3698 /*
3699  *	Set socket options on an inet socket.
3700  */
3701 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3702 			   sockptr_t optval, unsigned int optlen)
3703 {
3704 	struct sock *sk = sock->sk;
3705 
3706 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3707 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3708 }
3709 EXPORT_SYMBOL(sock_common_setsockopt);
3710 
3711 void sk_common_release(struct sock *sk)
3712 {
3713 	if (sk->sk_prot->destroy)
3714 		sk->sk_prot->destroy(sk);
3715 
3716 	/*
3717 	 * Observation: when sk_common_release is called, processes have
3718 	 * no access to the socket, but the network stack still does.
3719 	 * Step one, detach it from networking:
3720 	 *
3721 	 * A. Remove from hash tables.
3722 	 */
3723 
3724 	sk->sk_prot->unhash(sk);
3725 
3726 	/*
3727 	 * At this point the socket cannot receive new packets, but it is possible
3728 	 * that some packets are in flight because some CPU was running the receiver
3729 	 * and did the hash table lookup before we unhashed the socket. They will
3730 	 * reach the receive queue and will be purged by the socket destructor.
3731 	 *
3732 	 * Also we still have packets pending on the receive queue and probably
3733 	 * our own packets waiting in device queues. sock_destroy will drain the
3734 	 * receive queue, but transmitted packets will delay socket destruction
3735 	 * until the last reference is released.
3736 	 */
3737 
3738 	sock_orphan(sk);
3739 
3740 	xfrm_sk_free_policy(sk);
3741 
3742 	sock_put(sk);
3743 }
3744 EXPORT_SYMBOL(sk_common_release);
3745 
3746 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3747 {
3748 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3749 
3750 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3751 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3752 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3753 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3754 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3755 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3756 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3757 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3758 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3759 }
3760 
3761 #ifdef CONFIG_PROC_FS
3762 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3763 
3764 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3765 {
3766 	int cpu, idx = prot->inuse_idx;
3767 	int res = 0;
3768 
3769 	for_each_possible_cpu(cpu)
3770 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3771 
3772 	return res >= 0 ? res : 0;
3773 }
3774 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3775 
3776 int sock_inuse_get(struct net *net)
3777 {
3778 	int cpu, res = 0;
3779 
3780 	for_each_possible_cpu(cpu)
3781 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3782 
3783 	return res;
3784 }
3785 
3786 EXPORT_SYMBOL_GPL(sock_inuse_get);
3787 
3788 static int __net_init sock_inuse_init_net(struct net *net)
3789 {
3790 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3791 	if (net->core.prot_inuse == NULL)
3792 		return -ENOMEM;
3793 	return 0;
3794 }
3795 
3796 static void __net_exit sock_inuse_exit_net(struct net *net)
3797 {
3798 	free_percpu(net->core.prot_inuse);
3799 }
3800 
3801 static struct pernet_operations net_inuse_ops = {
3802 	.init = sock_inuse_init_net,
3803 	.exit = sock_inuse_exit_net,
3804 };
3805 
3806 static __init int net_inuse_init(void)
3807 {
3808 	if (register_pernet_subsys(&net_inuse_ops))
3809 		panic("Cannot initialize net inuse counters");
3810 
3811 	return 0;
3812 }
3813 
3814 core_initcall(net_inuse_init);
3815 
3816 static int assign_proto_idx(struct proto *prot)
3817 {
3818 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3819 
3820 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3821 		pr_err("PROTO_INUSE_NR exhausted\n");
3822 		return -ENOSPC;
3823 	}
3824 
3825 	set_bit(prot->inuse_idx, proto_inuse_idx);
3826 	return 0;
3827 }
3828 
3829 static void release_proto_idx(struct proto *prot)
3830 {
3831 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3832 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3833 }
3834 #else
3835 static inline int assign_proto_idx(struct proto *prot)
3836 {
3837 	return 0;
3838 }
3839 
3840 static inline void release_proto_idx(struct proto *prot)
3841 {
3842 }
3843 
3844 #endif
3845 
3846 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3847 {
3848 	if (!twsk_prot)
3849 		return;
3850 	kfree(twsk_prot->twsk_slab_name);
3851 	twsk_prot->twsk_slab_name = NULL;
3852 	kmem_cache_destroy(twsk_prot->twsk_slab);
3853 	twsk_prot->twsk_slab = NULL;
3854 }
3855 
3856 static int tw_prot_init(const struct proto *prot)
3857 {
3858 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3859 
3860 	if (!twsk_prot)
3861 		return 0;
3862 
3863 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3864 					      prot->name);
3865 	if (!twsk_prot->twsk_slab_name)
3866 		return -ENOMEM;
3867 
3868 	twsk_prot->twsk_slab =
3869 		kmem_cache_create(twsk_prot->twsk_slab_name,
3870 				  twsk_prot->twsk_obj_size, 0,
3871 				  SLAB_ACCOUNT | prot->slab_flags,
3872 				  NULL);
3873 	if (!twsk_prot->twsk_slab) {
3874 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3875 			prot->name);
3876 		return -ENOMEM;
3877 	}
3878 
3879 	return 0;
3880 }
3881 
3882 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3883 {
3884 	if (!rsk_prot)
3885 		return;
3886 	kfree(rsk_prot->slab_name);
3887 	rsk_prot->slab_name = NULL;
3888 	kmem_cache_destroy(rsk_prot->slab);
3889 	rsk_prot->slab = NULL;
3890 }
3891 
3892 static int req_prot_init(const struct proto *prot)
3893 {
3894 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3895 
3896 	if (!rsk_prot)
3897 		return 0;
3898 
3899 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3900 					prot->name);
3901 	if (!rsk_prot->slab_name)
3902 		return -ENOMEM;
3903 
3904 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3905 					   rsk_prot->obj_size, 0,
3906 					   SLAB_ACCOUNT | prot->slab_flags,
3907 					   NULL);
3908 
3909 	if (!rsk_prot->slab) {
3910 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3911 			prot->name);
3912 		return -ENOMEM;
3913 	}
3914 	return 0;
3915 }
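/* Register a protocol with the core.  With alloc_slab != 0 the per-socket,
 * request_sock and timewait caches are created here as well.
 *
 * Usage sketch (illustrative only, placeholder names):
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);
 *	...
 *	proto_unregister(&my_proto);
 */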
3916 
3917 int proto_register(struct proto *prot, int alloc_slab)
3918 {
3919 	int ret = -ENOBUFS;
3920 
3921 	if (prot->memory_allocated && !prot->sysctl_mem) {
3922 		pr_err("%s: missing sysctl_mem\n", prot->name);
3923 		return -EINVAL;
3924 	}
3925 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3926 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3927 		return -EINVAL;
3928 	}
3929 	if (alloc_slab) {
3930 		prot->slab = kmem_cache_create_usercopy(prot->name,
3931 					prot->obj_size, 0,
3932 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3933 					prot->slab_flags,
3934 					prot->useroffset, prot->usersize,
3935 					NULL);
3936 
3937 		if (prot->slab == NULL) {
3938 			pr_crit("%s: Can't create sock SLAB cache!\n",
3939 				prot->name);
3940 			goto out;
3941 		}
3942 
3943 		if (req_prot_init(prot))
3944 			goto out_free_request_sock_slab;
3945 
3946 		if (tw_prot_init(prot))
3947 			goto out_free_timewait_sock_slab;
3948 	}
3949 
3950 	mutex_lock(&proto_list_mutex);
3951 	ret = assign_proto_idx(prot);
3952 	if (ret) {
3953 		mutex_unlock(&proto_list_mutex);
3954 		goto out_free_timewait_sock_slab;
3955 	}
3956 	list_add(&prot->node, &proto_list);
3957 	mutex_unlock(&proto_list_mutex);
3958 	return ret;
3959 
3960 out_free_timewait_sock_slab:
3961 	if (alloc_slab)
3962 		tw_prot_cleanup(prot->twsk_prot);
3963 out_free_request_sock_slab:
3964 	if (alloc_slab) {
3965 		req_prot_cleanup(prot->rsk_prot);
3966 
3967 		kmem_cache_destroy(prot->slab);
3968 		prot->slab = NULL;
3969 	}
3970 out:
3971 	return ret;
3972 }
3973 EXPORT_SYMBOL(proto_register);
3974 
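/* Undo proto_register(): release the inuse slot, unlink the protocol from
 * proto_list and destroy its caches.
 */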
3975 void proto_unregister(struct proto *prot)
3976 {
3977 	mutex_lock(&proto_list_mutex);
3978 	release_proto_idx(prot);
3979 	list_del(&prot->node);
3980 	mutex_unlock(&proto_list_mutex);
3981 
3982 	kmem_cache_destroy(prot->slab);
3983 	prot->slab = NULL;
3984 
3985 	req_prot_cleanup(prot->rsk_prot);
3986 	tw_prot_cleanup(prot->twsk_prot);
3987 }
3988 EXPORT_SYMBOL(proto_unregister);
3989 
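/* Ask modprobe for the sock_diag handler of @family (and, when given,
 * @protocol).  The request_module() patterns here are expected to match the
 * netlink module aliases that the various *_diag modules advertise.
 */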
3990 int sock_load_diag_module(int family, int protocol)
3991 {
3992 	if (!protocol) {
3993 		if (!sock_is_registered(family))
3994 			return -ENOENT;
3995 
3996 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3997 				      NETLINK_SOCK_DIAG, family);
3998 	}
3999 
4000 #ifdef CONFIG_INET
4001 	if (family == AF_INET &&
4002 	    protocol != IPPROTO_RAW &&
4003 	    protocol < MAX_INET_PROTOS &&
4004 	    !rcu_access_pointer(inet_protos[protocol]))
4005 		return -ENOENT;
4006 #endif
4007 
4008 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4009 			      NETLINK_SOCK_DIAG, family, protocol);
4010 }
4011 EXPORT_SYMBOL(sock_load_diag_module);
4012 
4013 #ifdef CONFIG_PROC_FS
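/* /proc/net/protocols: one row per registered protocol, walked under
 * proto_list_mutex via the seq_file iterator below.
 */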
4014 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4015 	__acquires(proto_list_mutex)
4016 {
4017 	mutex_lock(&proto_list_mutex);
4018 	return seq_list_start_head(&proto_list, *pos);
4019 }
4020 
4021 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4022 {
4023 	return seq_list_next(v, &proto_list, pos);
4024 }
4025 
4026 static void proto_seq_stop(struct seq_file *seq, void *v)
4027 	__releases(proto_list_mutex)
4028 {
4029 	mutex_unlock(&proto_list_mutex);
4030 }
4031 
4032 static char proto_method_implemented(const void *method)
4033 {
4034 	return method == NULL ? 'n' : 'y';
4035 }

4036 static long sock_prot_memory_allocated(struct proto *proto)
4037 {
4038 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4039 }
4040 
4041 static const char *sock_prot_memory_pressure(struct proto *proto)
4042 {
4043 	return proto->memory_pressure != NULL ?
4044 	       (proto_memory_pressure(proto) ? "yes" : "no") : "NI";
4045 }
4046 
4047 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4048 {
4050 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4051 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4052 		   proto->name,
4053 		   proto->obj_size,
4054 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4055 		   sock_prot_memory_allocated(proto),
4056 		   sock_prot_memory_pressure(proto),
4057 		   proto->max_header,
4058 		   proto->slab == NULL ? "no" : "yes",
4059 		   module_name(proto->owner),
4060 		   proto_method_implemented(proto->close),
4061 		   proto_method_implemented(proto->connect),
4062 		   proto_method_implemented(proto->disconnect),
4063 		   proto_method_implemented(proto->accept),
4064 		   proto_method_implemented(proto->ioctl),
4065 		   proto_method_implemented(proto->init),
4066 		   proto_method_implemented(proto->destroy),
4067 		   proto_method_implemented(proto->shutdown),
4068 		   proto_method_implemented(proto->setsockopt),
4069 		   proto_method_implemented(proto->getsockopt),
4070 		   proto_method_implemented(proto->sendmsg),
4071 		   proto_method_implemented(proto->recvmsg),
4072 		   proto_method_implemented(proto->bind),
4073 		   proto_method_implemented(proto->backlog_rcv),
4074 		   proto_method_implemented(proto->hash),
4075 		   proto_method_implemented(proto->unhash),
4076 		   proto_method_implemented(proto->get_port),
4077 		   proto_method_implemented(proto->enter_memory_pressure));
4078 }
4079 
4080 static int proto_seq_show(struct seq_file *seq, void *v)
4081 {
4082 	if (v == &proto_list)
4083 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4084 			   "protocol",
4085 			   "size",
4086 			   "sockets",
4087 			   "memory",
4088 			   "press",
4089 			   "maxhdr",
4090 			   "slab",
4091 			   "module",
4092 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4093 	else
4094 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4095 	return 0;
4096 }
4097 
4098 static const struct seq_operations proto_seq_ops = {
4099 	.start  = proto_seq_start,
4100 	.next   = proto_seq_next,
4101 	.stop   = proto_seq_stop,
4102 	.show   = proto_seq_show,
4103 };
4104 
4105 static __net_init int proto_init_net(struct net *net)
4106 {
4107 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4108 			sizeof(struct seq_net_private)))
4109 		return -ENOMEM;
4110 
4111 	return 0;
4112 }
4113 
4114 static __net_exit void proto_exit_net(struct net *net)
4115 {
4116 	remove_proc_entry("protocols", net->proc_net);
4117 }
4118 
4119 
4121 	.init = proto_init_net,
4122 	.exit = proto_exit_net,
4123 };
4124 
4125 static int __init proto_init(void)
4126 {
4127 	return register_pernet_subsys(&proto_net_ops);
4128 }
4129 
4130 subsys_initcall(proto_init);
4131 
4132 #endif /* CONFIG_PROC_FS */
4133 
4134 #ifdef CONFIG_NET_RX_BUSY_POLL
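/* loop_end callback for busy polling: stop once data is queued on the
 * socket's receive queue or its busy-poll time budget has expired.
 */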
4135 bool sk_busy_loop_end(void *p, unsigned long start_time)
4136 {
4137 	struct sock *sk = p;
4138 
4139 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4140 	       sk_busy_loop_timeout(sk, start_time);
4141 }
4142 EXPORT_SYMBOL(sk_busy_loop_end);
4143 #endif /* CONFIG_NET_RX_BUSY_POLL */
4144 
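/* Invoke the protocol's optional bind_add hook; protocols that do not
 * implement it get -EOPNOTSUPP.
 */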
4145 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4146 {
4147 	if (!sk->sk_prot->bind_add)
4148 		return -EOPNOTSUPP;
4149 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4150 }
4151 EXPORT_SYMBOL(sock_bind_add);
4152 
4153 /* Copy 'size' bytes in from userspace, run the protocol's ioctl handler on
 * the kernel copy, then copy 'size' bytes of the result back to userspace
 * if the handler succeeds.
 */
4154 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4155 		     void __user *arg, void *karg, size_t size)
4156 {
4157 	int ret;
4158 
4159 	if (copy_from_user(karg, arg, size))
4160 		return -EFAULT;
4161 
4162 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4163 	if (ret)
4164 		return ret;
4165 
4166 	if (copy_to_user(arg, karg, size))
4167 		return -EFAULT;
4168 
4169 	return 0;
4170 }
4171 EXPORT_SYMBOL(sock_ioctl_inout);
4172 
4173 /* This is the most common ioctl prep function: no input is copied in from
4174  * userspace; the 4-byte result is copied back to userspace only when the
4175  * protocol ioctl handler returns successfully.
4176  */
4177 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4178 {
4179 	int ret, karg = 0;
4180 
4181 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4182 	if (ret)
4183 		return ret;
4184 
4185 	return put_user(karg, (int __user *)arg);
4186 }
4187 
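/* Sketch (not from this file): a protocol family's ioctl path would funnel
 * commands it does not handle itself into the wrapper below roughly as
 *
 *	return sk_ioctl(sk, cmd, (void __user *)arg);
 *
 * so that the per-protocol ->ioctl() callback only ever operates on kernel
 * memory (karg) rather than on the raw user pointer.
 */
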
4188 /* A wrapper around sock ioctls that, depending on the protocol/ioctl, copies
4189  * the input in from userspace and copies the result back out to userspace.
4190  * The main motivation for this function is to pass kernel memory to the
4191  * protocol ioctl callbacks instead of userspace memory.
4192  */
4193 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4194 {
4195 	int rc = 1;
4196 
4197 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4198 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4199 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4200 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4201 	else if (sk_is_phonet(sk))
4202 		rc = phonet_sk_ioctl(sk, cmd, arg);
4203 
4204 	/* If the ioctl was processed, return its value */
4205 	if (rc <= 0)
4206 		return rc;
4207 
4208 	/* Otherwise call the default handler */
4209 	return sock_ioctl_out(sk, cmd, arg);
4210 }
4211 EXPORT_SYMBOL(sk_ioctl);
4212