xref: /linux/net/core/sock.c (revision 4e73826089ce899357580bbf6e0afe4e6f9900b7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
136 
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 #include <net/bpf_sk_storage.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 #include <net/phonet/phonet.h>
146 
147 #include <linux/ethtool.h>
148 
149 #include "dev.h"
150 
/* Forward declarations of the default write-space callbacks. */
static void sock_def_write_space(struct sock *sk);
static void sock_def_write_space_wfree(struct sock *sk);

/*
 * File-local protocol list and its companion mutex.
 * NOTE(review): presumably proto_list_mutex serialises access to
 * proto_list; the users are not visible in this part of the file.
 */
static LIST_HEAD(proto_list);
static DEFINE_MUTEX(proto_list_mutex);
156 
157 /**
158  * sk_ns_capable - General socket capability test
159  * @sk: Socket to use a capability on or through
160  * @user_ns: The user namespace of the capability to use
161  * @cap: The capability to use
162  *
163  * Test to see if the opener of the socket had when the socket was
164  * created and the current process has the capability @cap in the user
165  * namespace @user_ns.
166  */
167 bool sk_ns_capable(const struct sock *sk,
168 		   struct user_namespace *user_ns, int cap)
169 {
170 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
171 		ns_capable(user_ns, cap);
172 }
173 EXPORT_SYMBOL(sk_ns_capable);
174 
175 /**
176  * sk_capable - Socket global capability test
177  * @sk: Socket to use a capability on or through
178  * @cap: The global capability to use
179  *
180  * Test to see if the opener of the socket had when the socket was
181  * created and the current process has the capability @cap in all user
182  * namespaces.
183  */
184 bool sk_capable(const struct sock *sk, int cap)
185 {
186 	return sk_ns_capable(sk, &init_user_ns, cap);
187 }
188 EXPORT_SYMBOL(sk_capable);
189 
190 /**
191  * sk_net_capable - Network namespace socket capability test
192  * @sk: Socket to use a capability on or through
193  * @cap: The capability to use
194  *
195  * Test to see if the opener of the socket had when the socket was created
196  * and the current process has the capability @cap over the network namespace
197  * the socket is a member of.
198  */
199 bool sk_net_capable(const struct sock *sk, int cap)
200 {
201 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
202 }
203 EXPORT_SYMBOL(sk_net_capable);
204 
205 /*
206  * Each address family might have different locking rules, so we have
207  * one slock key per address family and separate keys for internal and
208  * userspace sockets.
209  */
210 static struct lock_class_key af_family_keys[AF_MAX];
211 static struct lock_class_key af_family_kern_keys[AF_MAX];
212 static struct lock_class_key af_family_slock_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
214 
215 /*
216  * Make lock validator output more readable. (we pre-construct these
217  * strings build-time, so that runtime initialization of socket
218  * locks is fast):
219  */
220 
221 #define _sock_locks(x)						  \
222   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
223   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
224   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
225   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
226   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
227   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
228   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
229   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
230   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
231   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
232   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
233   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
234   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
235   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
236   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
237   x "AF_MCTP"  , \
238   x "AF_MAX"
239 
240 static const char *const af_family_key_strings[AF_MAX+1] = {
241 	_sock_locks("sk_lock-")
242 };
243 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
244 	_sock_locks("slock-")
245 };
246 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
247 	_sock_locks("clock-")
248 };
249 
250 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
251 	_sock_locks("k-sk_lock-")
252 };
253 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
254 	_sock_locks("k-slock-")
255 };
256 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
257 	_sock_locks("k-clock-")
258 };
259 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
260 	_sock_locks("rlock-")
261 };
262 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
263 	_sock_locks("wlock-")
264 };
265 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
266 	_sock_locks("elock-")
267 };
268 
269 /*
270  * sk_callback_lock and sk queues locking rules are per-address-family,
271  * so split the lock classes by using a per-AF key:
272  */
273 static struct lock_class_key af_callback_keys[AF_MAX];
274 static struct lock_class_key af_rlock_keys[AF_MAX];
275 static struct lock_class_key af_wlock_keys[AF_MAX];
276 static struct lock_class_key af_elock_keys[AF_MAX];
277 static struct lock_class_key af_kern_callback_keys[AF_MAX];
278 
279 /* Run time adjustable parameters. */
280 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
281 EXPORT_SYMBOL(sysctl_wmem_max);
282 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
283 EXPORT_SYMBOL(sysctl_rmem_max);
284 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
285 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
286 
287 int sysctl_tstamp_allow_data __read_mostly = 1;
288 
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_branch_inc(&memalloc_socks_key);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_branch_dec(&memalloc_socks_key);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned int noreclaim_flag;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	noreclaim_flag = memalloc_noreclaim_save();
334 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
335 				 tcp_v6_do_rcv,
336 				 tcp_v4_do_rcv,
337 				 sk, skb);
338 	memalloc_noreclaim_restore(noreclaim_flag);
339 
340 	return ret;
341 }
342 EXPORT_SYMBOL(__sk_backlog_rcv);
343 
344 void sk_error_report(struct sock *sk)
345 {
346 	sk->sk_error_report(sk);
347 
348 	switch (sk->sk_family) {
349 	case AF_INET:
350 		fallthrough;
351 	case AF_INET6:
352 		trace_inet_sk_error_report(sk);
353 		break;
354 	default:
355 		break;
356 	}
357 }
358 EXPORT_SYMBOL(sk_error_report);
359 
360 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
361 {
362 	struct __kernel_sock_timeval tv;
363 
364 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
365 		tv.tv_sec = 0;
366 		tv.tv_usec = 0;
367 	} else {
368 		tv.tv_sec = timeo / HZ;
369 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
370 	}
371 
372 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
373 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
374 		*(struct old_timeval32 *)optval = tv32;
375 		return sizeof(tv32);
376 	}
377 
378 	if (old_timeval) {
379 		struct __kernel_old_timeval old_tv;
380 		old_tv.tv_sec = tv.tv_sec;
381 		old_tv.tv_usec = tv.tv_usec;
382 		*(struct __kernel_old_timeval *)optval = old_tv;
383 		return sizeof(old_tv);
384 	}
385 
386 	*(struct __kernel_sock_timeval *)optval = tv;
387 	return sizeof(tv);
388 }
389 EXPORT_SYMBOL(sock_get_timeout);
390 
391 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
392 			   sockptr_t optval, int optlen, bool old_timeval)
393 {
394 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
395 		struct old_timeval32 tv32;
396 
397 		if (optlen < sizeof(tv32))
398 			return -EINVAL;
399 
400 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
401 			return -EFAULT;
402 		tv->tv_sec = tv32.tv_sec;
403 		tv->tv_usec = tv32.tv_usec;
404 	} else if (old_timeval) {
405 		struct __kernel_old_timeval old_tv;
406 
407 		if (optlen < sizeof(old_tv))
408 			return -EINVAL;
409 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
410 			return -EFAULT;
411 		tv->tv_sec = old_tv.tv_sec;
412 		tv->tv_usec = old_tv.tv_usec;
413 	} else {
414 		if (optlen < sizeof(*tv))
415 			return -EINVAL;
416 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
417 			return -EFAULT;
418 	}
419 
420 	return 0;
421 }
422 EXPORT_SYMBOL(sock_copy_user_timeval);
423 
424 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
425 			    bool old_timeval)
426 {
427 	struct __kernel_sock_timeval tv;
428 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
429 	long val;
430 
431 	if (err)
432 		return err;
433 
434 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
435 		return -EDOM;
436 
437 	if (tv.tv_sec < 0) {
438 		static int warned __read_mostly;
439 
440 		WRITE_ONCE(*timeo_p, 0);
441 		if (warned < 10 && net_ratelimit()) {
442 			warned++;
443 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 				__func__, current->comm, task_pid_nr(current));
445 		}
446 		return 0;
447 	}
448 	val = MAX_SCHEDULE_TIMEOUT;
449 	if ((tv.tv_sec || tv.tv_usec) &&
450 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
451 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
452 						    USEC_PER_SEC / HZ);
453 	WRITE_ONCE(*timeo_p, val);
454 	return 0;
455 }
456 
457 static bool sock_needs_netstamp(const struct sock *sk)
458 {
459 	switch (sk->sk_family) {
460 	case AF_UNSPEC:
461 	case AF_UNIX:
462 		return false;
463 	default:
464 		return true;
465 	}
466 }
467 
468 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
469 {
470 	if (sk->sk_flags & flags) {
471 		sk->sk_flags &= ~flags;
472 		if (sock_needs_netstamp(sk) &&
473 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
474 			net_disable_timestamp();
475 	}
476 }
477 
478 
479 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
480 {
481 	unsigned long flags;
482 	struct sk_buff_head *list = &sk->sk_receive_queue;
483 
484 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
485 		atomic_inc(&sk->sk_drops);
486 		trace_sock_rcvqueue_full(sk, skb);
487 		return -ENOMEM;
488 	}
489 
490 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
491 		atomic_inc(&sk->sk_drops);
492 		return -ENOBUFS;
493 	}
494 
495 	skb->dev = NULL;
496 	skb_set_owner_r(skb, sk);
497 
498 	/* we escape from rcu protected region, make sure we dont leak
499 	 * a norefcounted dst
500 	 */
501 	skb_dst_force(skb);
502 
503 	spin_lock_irqsave(&list->lock, flags);
504 	sock_skb_set_dropcount(sk, skb);
505 	__skb_queue_tail(list, skb);
506 	spin_unlock_irqrestore(&list->lock, flags);
507 
508 	if (!sock_flag(sk, SOCK_DEAD))
509 		sk->sk_data_ready(sk);
510 	return 0;
511 }
512 EXPORT_SYMBOL(__sock_queue_rcv_skb);
513 
514 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
515 			      enum skb_drop_reason *reason)
516 {
517 	enum skb_drop_reason drop_reason;
518 	int err;
519 
520 	err = sk_filter(sk, skb);
521 	if (err) {
522 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
523 		goto out;
524 	}
525 	err = __sock_queue_rcv_skb(sk, skb);
526 	switch (err) {
527 	case -ENOMEM:
528 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
529 		break;
530 	case -ENOBUFS:
531 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
532 		break;
533 	default:
534 		drop_reason = SKB_NOT_DROPPED_YET;
535 		break;
536 	}
537 out:
538 	if (reason)
539 		*reason = drop_reason;
540 	return err;
541 }
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
543 
544 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
545 		     const int nested, unsigned int trim_cap, bool refcounted)
546 {
547 	int rc = NET_RX_SUCCESS;
548 
549 	if (sk_filter_trim_cap(sk, skb, trim_cap))
550 		goto discard_and_relse;
551 
552 	skb->dev = NULL;
553 
554 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
555 		atomic_inc(&sk->sk_drops);
556 		goto discard_and_relse;
557 	}
558 	if (nested)
559 		bh_lock_sock_nested(sk);
560 	else
561 		bh_lock_sock(sk);
562 	if (!sock_owned_by_user(sk)) {
563 		/*
564 		 * trylock + unlock semantics:
565 		 */
566 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
567 
568 		rc = sk_backlog_rcv(sk, skb);
569 
570 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
571 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
572 		bh_unlock_sock(sk);
573 		atomic_inc(&sk->sk_drops);
574 		goto discard_and_relse;
575 	}
576 
577 	bh_unlock_sock(sk);
578 out:
579 	if (refcounted)
580 		sock_put(sk);
581 	return rc;
582 discard_and_relse:
583 	kfree_skb(skb);
584 	goto out;
585 }
586 EXPORT_SYMBOL(__sk_receive_skb);
587 
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
589 							  u32));
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
591 							   u32));
592 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
593 {
594 	struct dst_entry *dst = __sk_dst_get(sk);
595 
596 	if (dst && dst->obsolete &&
597 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
598 			       dst, cookie) == NULL) {
599 		sk_tx_queue_clear(sk);
600 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
601 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
602 		dst_release(dst);
603 		return NULL;
604 	}
605 
606 	return dst;
607 }
608 EXPORT_SYMBOL(__sk_dst_check);
609 
610 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
611 {
612 	struct dst_entry *dst = sk_dst_get(sk);
613 
614 	if (dst && dst->obsolete &&
615 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
616 			       dst, cookie) == NULL) {
617 		sk_dst_reset(sk);
618 		dst_release(dst);
619 		return NULL;
620 	}
621 
622 	return dst;
623 }
624 EXPORT_SYMBOL(sk_dst_check);
625 
626 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
627 {
628 	int ret = -ENOPROTOOPT;
629 #ifdef CONFIG_NETDEVICES
630 	struct net *net = sock_net(sk);
631 
632 	/* Sorry... */
633 	ret = -EPERM;
634 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
635 		goto out;
636 
637 	ret = -EINVAL;
638 	if (ifindex < 0)
639 		goto out;
640 
641 	/* Paired with all READ_ONCE() done locklessly. */
642 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
643 
644 	if (sk->sk_prot->rehash)
645 		sk->sk_prot->rehash(sk);
646 	sk_dst_reset(sk);
647 
648 	ret = 0;
649 
650 out:
651 #endif
652 
653 	return ret;
654 }
655 
656 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
657 {
658 	int ret;
659 
660 	if (lock_sk)
661 		lock_sock(sk);
662 	ret = sock_bindtoindex_locked(sk, ifindex);
663 	if (lock_sk)
664 		release_sock(sk);
665 
666 	return ret;
667 }
668 EXPORT_SYMBOL(sock_bindtoindex);
669 
670 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
671 {
672 	int ret = -ENOPROTOOPT;
673 #ifdef CONFIG_NETDEVICES
674 	struct net *net = sock_net(sk);
675 	char devname[IFNAMSIZ];
676 	int index;
677 
678 	ret = -EINVAL;
679 	if (optlen < 0)
680 		goto out;
681 
682 	/* Bind this socket to a particular device like "eth0",
683 	 * as specified in the passed interface name. If the
684 	 * name is "" or the option length is zero the socket
685 	 * is not bound.
686 	 */
687 	if (optlen > IFNAMSIZ - 1)
688 		optlen = IFNAMSIZ - 1;
689 	memset(devname, 0, sizeof(devname));
690 
691 	ret = -EFAULT;
692 	if (copy_from_sockptr(devname, optval, optlen))
693 		goto out;
694 
695 	index = 0;
696 	if (devname[0] != '\0') {
697 		struct net_device *dev;
698 
699 		rcu_read_lock();
700 		dev = dev_get_by_name_rcu(net, devname);
701 		if (dev)
702 			index = dev->ifindex;
703 		rcu_read_unlock();
704 		ret = -ENODEV;
705 		if (!dev)
706 			goto out;
707 	}
708 
709 	sockopt_lock_sock(sk);
710 	ret = sock_bindtoindex_locked(sk, index);
711 	sockopt_release_sock(sk);
712 out:
713 #endif
714 
715 	return ret;
716 }
717 
718 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
719 				sockptr_t optlen, int len)
720 {
721 	int ret = -ENOPROTOOPT;
722 #ifdef CONFIG_NETDEVICES
723 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
724 	struct net *net = sock_net(sk);
725 	char devname[IFNAMSIZ];
726 
727 	if (bound_dev_if == 0) {
728 		len = 0;
729 		goto zero;
730 	}
731 
732 	ret = -EINVAL;
733 	if (len < IFNAMSIZ)
734 		goto out;
735 
736 	ret = netdev_get_name(net, devname, bound_dev_if);
737 	if (ret)
738 		goto out;
739 
740 	len = strlen(devname) + 1;
741 
742 	ret = -EFAULT;
743 	if (copy_to_sockptr(optval, devname, len))
744 		goto out;
745 
746 zero:
747 	ret = -EFAULT;
748 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
749 		goto out;
750 
751 	ret = 0;
752 
753 out:
754 #endif
755 
756 	return ret;
757 }
758 
759 bool sk_mc_loop(const struct sock *sk)
760 {
761 	if (dev_recursion_level())
762 		return false;
763 	if (!sk)
764 		return true;
765 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
766 	switch (READ_ONCE(sk->sk_family)) {
767 	case AF_INET:
768 		return inet_test_bit(MC_LOOP, sk);
769 #if IS_ENABLED(CONFIG_IPV6)
770 	case AF_INET6:
771 		return inet6_test_bit(MC6_LOOP, sk);
772 #endif
773 	}
774 	WARN_ON_ONCE(1);
775 	return true;
776 }
777 EXPORT_SYMBOL(sk_mc_loop);
778 
779 void sock_set_reuseaddr(struct sock *sk)
780 {
781 	lock_sock(sk);
782 	sk->sk_reuse = SK_CAN_REUSE;
783 	release_sock(sk);
784 }
785 EXPORT_SYMBOL(sock_set_reuseaddr);
786 
787 void sock_set_reuseport(struct sock *sk)
788 {
789 	lock_sock(sk);
790 	sk->sk_reuseport = true;
791 	release_sock(sk);
792 }
793 EXPORT_SYMBOL(sock_set_reuseport);
794 
795 void sock_no_linger(struct sock *sk)
796 {
797 	lock_sock(sk);
798 	WRITE_ONCE(sk->sk_lingertime, 0);
799 	sock_set_flag(sk, SOCK_LINGER);
800 	release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_no_linger);
803 
804 void sock_set_priority(struct sock *sk, u32 priority)
805 {
806 	WRITE_ONCE(sk->sk_priority, priority);
807 }
808 EXPORT_SYMBOL(sock_set_priority);
809 
810 void sock_set_sndtimeo(struct sock *sk, s64 secs)
811 {
812 	lock_sock(sk);
813 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
814 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
815 	else
816 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
817 	release_sock(sk);
818 }
819 EXPORT_SYMBOL(sock_set_sndtimeo);
820 
821 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
822 {
823 	if (val)  {
824 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
825 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
826 		sock_set_flag(sk, SOCK_RCVTSTAMP);
827 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
828 	} else {
829 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
830 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
831 	}
832 }
833 
834 void sock_enable_timestamps(struct sock *sk)
835 {
836 	lock_sock(sk);
837 	__sock_set_timestamps(sk, true, false, true);
838 	release_sock(sk);
839 }
840 EXPORT_SYMBOL(sock_enable_timestamps);
841 
842 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
843 {
844 	switch (optname) {
845 	case SO_TIMESTAMP_OLD:
846 		__sock_set_timestamps(sk, valbool, false, false);
847 		break;
848 	case SO_TIMESTAMP_NEW:
849 		__sock_set_timestamps(sk, valbool, true, false);
850 		break;
851 	case SO_TIMESTAMPNS_OLD:
852 		__sock_set_timestamps(sk, valbool, false, true);
853 		break;
854 	case SO_TIMESTAMPNS_NEW:
855 		__sock_set_timestamps(sk, valbool, true, true);
856 		break;
857 	}
858 }
859 
860 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
861 {
862 	struct net *net = sock_net(sk);
863 	struct net_device *dev = NULL;
864 	bool match = false;
865 	int *vclock_index;
866 	int i, num;
867 
868 	if (sk->sk_bound_dev_if)
869 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
870 
871 	if (!dev) {
872 		pr_err("%s: sock not bind to device\n", __func__);
873 		return -EOPNOTSUPP;
874 	}
875 
876 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
877 	dev_put(dev);
878 
879 	for (i = 0; i < num; i++) {
880 		if (*(vclock_index + i) == phc_index) {
881 			match = true;
882 			break;
883 		}
884 	}
885 
886 	if (num > 0)
887 		kfree(vclock_index);
888 
889 	if (!match)
890 		return -EINVAL;
891 
892 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
893 
894 	return 0;
895 }
896 
897 int sock_set_timestamping(struct sock *sk, int optname,
898 			  struct so_timestamping timestamping)
899 {
900 	int val = timestamping.flags;
901 	int ret;
902 
903 	if (val & ~SOF_TIMESTAMPING_MASK)
904 		return -EINVAL;
905 
906 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
907 	    !(val & SOF_TIMESTAMPING_OPT_ID))
908 		return -EINVAL;
909 
910 	if (val & SOF_TIMESTAMPING_OPT_ID &&
911 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
912 		if (sk_is_tcp(sk)) {
913 			if ((1 << sk->sk_state) &
914 			    (TCPF_CLOSE | TCPF_LISTEN))
915 				return -EINVAL;
916 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
917 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
918 			else
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
920 		} else {
921 			atomic_set(&sk->sk_tskey, 0);
922 		}
923 	}
924 
925 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
926 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
927 		return -EINVAL;
928 
929 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
930 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
931 		if (ret)
932 			return ret;
933 	}
934 
935 	WRITE_ONCE(sk->sk_tsflags, val);
936 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
937 
938 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
939 		sock_enable_timestamp(sk,
940 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
941 	else
942 		sock_disable_timestamp(sk,
943 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
944 	return 0;
945 }
946 
947 void sock_set_keepalive(struct sock *sk)
948 {
949 	lock_sock(sk);
950 	if (sk->sk_prot->keepalive)
951 		sk->sk_prot->keepalive(sk, true);
952 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
953 	release_sock(sk);
954 }
955 EXPORT_SYMBOL(sock_set_keepalive);
956 
957 static void __sock_set_rcvbuf(struct sock *sk, int val)
958 {
959 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
960 	 * as a negative value.
961 	 */
962 	val = min_t(int, val, INT_MAX / 2);
963 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
964 
965 	/* We double it on the way in to account for "struct sk_buff" etc.
966 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
967 	 * will allow that much actual data to be received on that socket.
968 	 *
969 	 * Applications are unaware that "struct sk_buff" and other overheads
970 	 * allocate from the receive buffer during socket buffer allocation.
971 	 *
972 	 * And after considering the possible alternatives, returning the value
973 	 * we actually used in getsockopt is the most desirable behavior.
974 	 */
975 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
976 }
977 
/**
 * sock_set_rcvbuf - set the receive buffer size of a socket
 * @sk: socket to modify
 * @val: requested size; see __sock_set_rcvbuf() for how it is adjusted
 *
 * Locked wrapper around __sock_set_rcvbuf().
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
985 
986 static void __sock_set_mark(struct sock *sk, u32 val)
987 {
988 	if (val != sk->sk_mark) {
989 		WRITE_ONCE(sk->sk_mark, val);
990 		sk_dst_reset(sk);
991 	}
992 }
993 
994 void sock_set_mark(struct sock *sk, u32 val)
995 {
996 	lock_sock(sk);
997 	__sock_set_mark(sk, val);
998 	release_sock(sk);
999 }
1000 EXPORT_SYMBOL(sock_set_mark);
1001 
1002 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1003 {
1004 	/* Round down bytes to multiple of pages */
1005 	bytes = round_down(bytes, PAGE_SIZE);
1006 
1007 	WARN_ON(bytes > sk->sk_reserved_mem);
1008 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1009 	sk_mem_reclaim(sk);
1010 }
1011 
1012 static int sock_reserve_memory(struct sock *sk, int bytes)
1013 {
1014 	long allocated;
1015 	bool charged;
1016 	int pages;
1017 
1018 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1019 		return -EOPNOTSUPP;
1020 
1021 	if (!bytes)
1022 		return 0;
1023 
1024 	pages = sk_mem_pages(bytes);
1025 
1026 	/* pre-charge to memcg */
1027 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1028 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1029 	if (!charged)
1030 		return -ENOMEM;
1031 
1032 	/* pre-charge to forward_alloc */
1033 	sk_memory_allocated_add(sk, pages);
1034 	allocated = sk_memory_allocated(sk);
1035 	/* If the system goes into memory pressure with this
1036 	 * precharge, give up and return error.
1037 	 */
1038 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1039 		sk_memory_allocated_sub(sk, pages);
1040 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1041 		return -ENOMEM;
1042 	}
1043 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1044 
1045 	WRITE_ONCE(sk->sk_reserved_mem,
1046 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1047 
1048 	return 0;
1049 }
1050 
/**
 * sockopt_lock_sock - take the socket lock for a sockopt operation
 * @sk: socket to lock
 *
 * Does nothing when current->bpf_ctx is set: in that case setsockopt is
 * being called from a bpf prog, and bpf has already made sure the sk
 * lock is held.
 */
void sockopt_lock_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1063 
/*
 * Counterpart of sockopt_lock_sock(): drop the socket lock, except when
 * has_current_bpf_ctx() reports a bpf context, in which case
 * sockopt_lock_sock() did not take it either.
 */
void sockopt_release_sock(struct sock *sk)
{
	if (!has_current_bpf_ctx())
		release_sock(sk);
}
1071 EXPORT_SYMBOL(sockopt_release_sock);
1072 
1073 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1074 {
1075 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1076 }
1077 EXPORT_SYMBOL(sockopt_ns_capable);
1078 
1079 bool sockopt_capable(int cap)
1080 {
1081 	return has_current_bpf_ctx() || capable(cap);
1082 }
1083 EXPORT_SYMBOL(sockopt_capable);
1084 
1085 /*
1086  *	This is meant for all protocols to use and covers goings on
1087  *	at the socket level. Everything here is generic.
1088  */
1089 
/*
 * sk_setsockopt - set a socket-level option on @sk.
 * @sk: socket being configured
 * @level: not referenced anywhere in this function
 * @optname: SO_* option selector
 * @optval: where the option value is read from
 * @optlen: number of bytes available at @optval
 *
 * SO_BINDTODEVICE is handed straight to sock_setbindtodevice().  Every
 * other option needs at least sizeof(int) bytes, which are copied into
 * 'val'; 'valbool' is 'val' normalised to 0/1.  A first switch handles the
 * options that return without taking the socket lock; all remaining
 * options run between sockopt_lock_sock() and sockopt_release_sock().
 *
 * Returns 0 on success or a negative errno: -EINVAL, -EFAULT, -EPERM,
 * -EACCES, -EOPNOTSUPP, -ENOPROTOOPT, or whatever the per-option helper
 * returned.
 */
1090 int sk_setsockopt(struct sock *sk, int level, int optname,
1091 		  sockptr_t optval, unsigned int optlen)
1092 {
1093 	struct so_timestamping timestamping;
1094 	struct socket *sock = sk->sk_socket;
1095 	struct sock_txtime sk_txtime;
1096 	int val;
1097 	int valbool;
1098 	struct linger ling;
1099 	int ret = 0;
1100 
1101 	/*
1102 	 *	SO_BINDTODEVICE is dispatched before the generic int parse below.
1103 	 */
1104 
1105 	if (optname == SO_BINDTODEVICE)
1106 		return sock_setbindtodevice(sk, optval, optlen);
1107 
1108 	if (optlen < sizeof(int))
1109 		return -EINVAL;
1110 
1111 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1112 		return -EFAULT;
1113 
1114 	valbool = val ? 1 : 0;
1115 
1116 	/* handle options which do not require locking the socket. */
1117 	switch (optname) {
1118 	case SO_PRIORITY:
	/* Values 0..6 are open to everyone; anything else needs
	 * CAP_NET_RAW or CAP_NET_ADMIN in the socket's netns user_ns.
	 */
1119 		if ((val >= 0 && val <= 6) ||
1120 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1121 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1122 			sock_set_priority(sk, val);
1123 			return 0;
1124 		}
1125 		return -EPERM;
1126 	case SO_PASSSEC:
1127 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1128 		return 0;
1129 	case SO_PASSCRED:
1130 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1131 		return 0;
1132 	case SO_PASSPIDFD:
1133 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1134 		return 0;
	/* These four have cases in sk_getsockopt() but cannot be set. */
1135 	case SO_TYPE:
1136 	case SO_PROTOCOL:
1137 	case SO_DOMAIN:
1138 	case SO_ERROR:
1139 		return -ENOPROTOOPT;
1140 #ifdef CONFIG_NET_RX_BUSY_POLL
1141 	case SO_BUSY_POLL:
1142 		if (val < 0)
1143 			return -EINVAL;
1144 		WRITE_ONCE(sk->sk_ll_usec, val);
1145 		return 0;
1146 	case SO_PREFER_BUSY_POLL:
	/* Enabling needs CAP_NET_ADMIN; disabling does not. */
1147 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1148 			return -EPERM;
1149 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1150 		return 0;
1151 	case SO_BUSY_POLL_BUDGET:
	/* Raising the budget above its current value needs
	 * CAP_NET_ADMIN; the range check comes after the permission check.
	 */
1152 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1153 		    !sockopt_capable(CAP_NET_ADMIN))
1154 			return -EPERM;
1155 		if (val < 0 || val > U16_MAX)
1156 			return -EINVAL;
1157 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1158 		return 0;
1159 #endif
1160 	case SO_MAX_PACING_RATE:
1161 		{
	/* Start from the int value (all-ones maps to ~0UL).  When
	 * unsigned long is wider than int and the caller supplied
	 * enough bytes, re-read the value as an unsigned long.
	 */
1162 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1163 		unsigned long pacing_rate;
1164 
1165 		if (sizeof(ulval) != sizeof(val) &&
1166 		    optlen >= sizeof(ulval) &&
1167 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1168 			return -EFAULT;
1169 		}
1170 		if (ulval != ~0UL)
1171 			cmpxchg(&sk->sk_pacing_status,
1172 				SK_PACING_NONE,
1173 				SK_PACING_NEEDED);
1174 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1175 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
	/* Lower the current pacing rate if it exceeds the new maximum. */
1176 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1177 		if (ulval < pacing_rate)
1178 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1179 		return 0;
1180 		}
1181 	case SO_TXREHASH:
1182 		if (val < -1 || val > 1)
1183 			return -EINVAL;
	/* SOCK_TXREHASH_DEFAULT selects the per-netns sysctl value. */
1184 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1185 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1186 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1187 		 * and sk_getsockopt().
1188 		 */
1189 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1190 		return 0;
1191 	}
1192 
	/* Everything below runs with the socket lock held (unless called
	 * from a bpf context, see sockopt_lock_sock()); cases report errors
	 * through 'ret' and break so that the lock is released at the end.
	 */
1193 	sockopt_lock_sock(sk);
1194 
1195 	switch (optname) {
1196 	case SO_DEBUG:
1197 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1198 			ret = -EACCES;
1199 		else
1200 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1201 		break;
1202 	case SO_REUSEADDR:
1203 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1204 		break;
1205 	case SO_REUSEPORT:
1206 		sk->sk_reuseport = valbool;
1207 		break;
1208 	case SO_DONTROUTE:
1209 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1210 		sk_dst_reset(sk);
1211 		break;
1212 	case SO_BROADCAST:
1213 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1214 		break;
1215 	case SO_SNDBUF:
1216 		/* Don't error on this BSD doesn't and if you think
1217 		 * about it this is right. Otherwise apps have to
1218 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1219 		 * are treated in BSD as hints
1220 		 */
1221 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
	/* Shared tail: SO_SNDBUFFORCE jumps here, skipping the sysctl clamp. */
1222 set_sndbuf:
1223 		/* Ensure val * 2 fits into an int, to prevent max_t()
1224 		 * from treating it as a negative value.
1225 		 */
1226 		val = min_t(int, val, INT_MAX / 2);
1227 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1228 		WRITE_ONCE(sk->sk_sndbuf,
1229 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1230 		/* Wake up sending tasks if we upped the value. */
1231 		sk->sk_write_space(sk);
1232 		break;
1233 
1234 	case SO_SNDBUFFORCE:
1235 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1236 			ret = -EPERM;
1237 			break;
1238 		}
1239 
1240 		/* No negative values (to prevent underflow, as val will be
1241 		 * multiplied by 2).
1242 		 */
1243 		if (val < 0)
1244 			val = 0;
1245 		goto set_sndbuf;
1246 
1247 	case SO_RCVBUF:
1248 		/* Don't error on this BSD doesn't and if you think
1249 		 * about it this is right. Otherwise apps have to
1250 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1251 		 * are treated in BSD as hints
1252 		 */
1253 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1254 		break;
1255 
1256 	case SO_RCVBUFFORCE:
1257 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1258 			ret = -EPERM;
1259 			break;
1260 		}
1261 
1262 		/* No negative values (to prevent underflow, as val will be
1263 		 * multiplied by 2).
1264 		 */
1265 		__sock_set_rcvbuf(sk, max(val, 0));
1266 		break;
1267 
1268 	case SO_KEEPALIVE:
	/* Let the protocol react first if it provides a hook. */
1269 		if (sk->sk_prot->keepalive)
1270 			sk->sk_prot->keepalive(sk, valbool);
1271 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1272 		break;
1273 
1274 	case SO_OOBINLINE:
1275 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1276 		break;
1277 
1278 	case SO_NO_CHECK:
1279 		sk->sk_no_check_tx = valbool;
1280 		break;
1281 
1282 	case SO_LINGER:
1283 		if (optlen < sizeof(ling)) {
1284 			ret = -EINVAL;	/* 1003.1g */
1285 			break;
1286 		}
1287 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1288 			ret = -EFAULT;
1289 			break;
1290 		}
1291 		if (!ling.l_onoff) {
1292 			sock_reset_flag(sk, SOCK_LINGER);
1293 		} else {
	/* l_linger is in seconds; the stored value is t_sec * HZ,
	 * capped at MAX_SCHEDULE_TIMEOUT.
	 */
1294 			unsigned long t_sec = ling.l_linger;
1295 
1296 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1297 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1298 			else
1299 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1300 			sock_set_flag(sk, SOCK_LINGER);
1301 		}
1302 		break;
1303 
	/* Accepted and ignored. */
1304 	case SO_BSDCOMPAT:
1305 		break;
1306 
1307 	case SO_TIMESTAMP_OLD:
1308 	case SO_TIMESTAMP_NEW:
1309 	case SO_TIMESTAMPNS_OLD:
1310 	case SO_TIMESTAMPNS_NEW:
1311 		sock_set_timestamp(sk, optname, valbool);
1312 		break;
1313 
1314 	case SO_TIMESTAMPING_NEW:
1315 	case SO_TIMESTAMPING_OLD:
	/* A buffer of exactly sizeof(struct so_timestamping) carries
	 * the full structure; any other length is treated as a bare
	 * int holding only the flags.
	 */
1316 		if (optlen == sizeof(timestamping)) {
1317 			if (copy_from_sockptr(&timestamping, optval,
1318 					      sizeof(timestamping))) {
1319 				ret = -EFAULT;
1320 				break;
1321 			}
1322 		} else {
1323 			memset(&timestamping, 0, sizeof(timestamping));
1324 			timestamping.flags = val;
1325 		}
1326 		ret = sock_set_timestamping(sk, optname, timestamping);
1327 		break;
1328 
1329 	case SO_RCVLOWAT:
1330 		{
1331 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1332 
	/* Negative requests become INT_MAX.  'sock' may be NULL here,
	 * in which case the generic store below is used; a stored
	 * value of 0 is replaced by 1.
	 */
1333 		if (val < 0)
1334 			val = INT_MAX;
1335 		if (sock)
1336 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1337 		if (set_rcvlowat)
1338 			ret = set_rcvlowat(sk, val);
1339 		else
1340 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1341 		break;
1342 		}
1343 	case SO_RCVTIMEO_OLD:
1344 	case SO_RCVTIMEO_NEW:
1345 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1346 				       optlen, optname == SO_RCVTIMEO_OLD);
1347 		break;
1348 
1349 	case SO_SNDTIMEO_OLD:
1350 	case SO_SNDTIMEO_NEW:
1351 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1352 				       optlen, optname == SO_SNDTIMEO_OLD);
1353 		break;
1354 
1355 	case SO_ATTACH_FILTER: {
1356 		struct sock_fprog fprog;
1357 
1358 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1359 		if (!ret)
1360 			ret = sk_attach_filter(&fprog, sk);
1361 		break;
1362 	}
1363 	case SO_ATTACH_BPF:
	/* The argument must be exactly a u32 (handed to sk_attach_bpf()). */
1364 		ret = -EINVAL;
1365 		if (optlen == sizeof(u32)) {
1366 			u32 ufd;
1367 
1368 			ret = -EFAULT;
1369 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1370 				break;
1371 
1372 			ret = sk_attach_bpf(ufd, sk);
1373 		}
1374 		break;
1375 
1376 	case SO_ATTACH_REUSEPORT_CBPF: {
1377 		struct sock_fprog fprog;
1378 
1379 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1380 		if (!ret)
1381 			ret = sk_reuseport_attach_filter(&fprog, sk);
1382 		break;
1383 	}
1384 	case SO_ATTACH_REUSEPORT_EBPF:
1385 		ret = -EINVAL;
1386 		if (optlen == sizeof(u32)) {
1387 			u32 ufd;
1388 
1389 			ret = -EFAULT;
1390 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1391 				break;
1392 
1393 			ret = sk_reuseport_attach_bpf(ufd, sk);
1394 		}
1395 		break;
1396 
1397 	case SO_DETACH_REUSEPORT_BPF:
1398 		ret = reuseport_detach_prog(sk);
1399 		break;
1400 
1401 	case SO_DETACH_FILTER:
1402 		ret = sk_detach_filter(sk);
1403 		break;
1404 
1405 	case SO_LOCK_FILTER:
	/* One-way switch: once set, the flag cannot be cleared. */
1406 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1407 			ret = -EPERM;
1408 		else
1409 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1410 		break;
1411 
1412 	case SO_MARK:
1413 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1414 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1415 			ret = -EPERM;
1416 			break;
1417 		}
1418 
1419 		__sock_set_mark(sk, val);
1420 		break;
1421 	case SO_RCVMARK:
1422 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1423 		break;
1424 
1425 	case SO_RXQ_OVFL:
1426 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1427 		break;
1428 
1429 	case SO_WIFI_STATUS:
1430 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1431 		break;
1432 
1433 	case SO_PEEK_OFF:
1434 		{
1435 		int (*set_peek_off)(struct sock *sk, int val);
1436 
	/* NOTE(review): unlike SO_RCVLOWAT above, 'sock' is
	 * dereferenced without a NULL check - confirm that no caller
	 * can reach this case with sk->sk_socket == NULL.
	 */
1437 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1438 		if (set_peek_off)
1439 			ret = set_peek_off(sk, val);
1440 		else
1441 			ret = -EOPNOTSUPP;
1442 		break;
1443 		}
1444 
1445 	case SO_NOFCS:
1446 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1447 		break;
1448 
1449 	case SO_SELECT_ERR_QUEUE:
1450 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1451 		break;
1452 
1453 
1454 	case SO_INCOMING_CPU:
1455 		reuseport_update_incoming_cpu(sk, val);
1456 		break;
1457 
1458 	case SO_CNX_ADVICE:
	/* Only the value 1 has an effect; other values are ignored. */
1459 		if (val == 1)
1460 			dst_negative_advice(sk);
1461 		break;
1462 
1463 	case SO_ZEROCOPY:
	/* Allowed for PF_INET/PF_INET6 sockets that are TCP or
	 * SOCK_DGRAM/IPPROTO_UDP, and for PF_RDS; everything else gets
	 * -EOPNOTSUPP.  The value itself must be 0 or 1.
	 */
1464 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1465 			if (!(sk_is_tcp(sk) ||
1466 			      (sk->sk_type == SOCK_DGRAM &&
1467 			       sk->sk_protocol == IPPROTO_UDP)))
1468 				ret = -EOPNOTSUPP;
1469 		} else if (sk->sk_family != PF_RDS) {
1470 			ret = -EOPNOTSUPP;
1471 		}
1472 		if (!ret) {
1473 			if (val < 0 || val > 1)
1474 				ret = -EINVAL;
1475 			else
1476 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1477 		}
1478 		break;
1479 
1480 	case SO_TXTIME:
	/* Requires exactly a struct sock_txtime with no unknown flags. */
1481 		if (optlen != sizeof(struct sock_txtime)) {
1482 			ret = -EINVAL;
1483 			break;
1484 		} else if (copy_from_sockptr(&sk_txtime, optval,
1485 			   sizeof(struct sock_txtime))) {
1486 			ret = -EFAULT;
1487 			break;
1488 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1489 			ret = -EINVAL;
1490 			break;
1491 		}
1492 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1493 		 * scheduler has enough safe guards.
1494 		 */
1495 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1496 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1497 			ret = -EPERM;
1498 			break;
1499 		}
1500 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1501 		sk->sk_clockid = sk_txtime.clockid;
1502 		sk->sk_txtime_deadline_mode =
1503 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1504 		sk->sk_txtime_report_errors =
1505 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1506 		break;
1507 
1508 	case SO_BINDTOIFINDEX:
1509 		ret = sock_bindtoindex_locked(sk, val);
1510 		break;
1511 
1512 	case SO_BUF_LOCK:
1513 		if (val & ~SOCK_BUF_LOCK_MASK) {
1514 			ret = -EINVAL;
1515 			break;
1516 		}
	/* Replace only the bits covered by SOCK_BUF_LOCK_MASK. */
1517 		sk->sk_userlocks = val | (sk->sk_userlocks &
1518 					  ~SOCK_BUF_LOCK_MASK);
1519 		break;
1520 
1521 	case SO_RESERVE_MEM:
1522 	{
1523 		int delta;
1524 
1525 		if (val < 0) {
1526 			ret = -EINVAL;
1527 			break;
1528 		}
1529 
	/* 'val' is the requested total; grow or shrink the current
	 * reservation by the difference.
	 */
1530 		delta = val - sk->sk_reserved_mem;
1531 		if (delta < 0)
1532 			sock_release_reserved_memory(sk, -delta);
1533 		else
1534 			ret = sock_reserve_memory(sk, delta);
1535 		break;
1536 	}
1537 
1538 	default:
1539 		ret = -ENOPROTOOPT;
1540 		break;
1541 	}
1542 	sockopt_release_sock(sk);
1543 	return ret;
1544 }
1545 
1546 int sock_setsockopt(struct socket *sock, int level, int optname,
1547 		    sockptr_t optval, unsigned int optlen)
1548 {
1549 	return sk_setsockopt(sock->sk, level, optname,
1550 			     optval, optlen);
1551 }
1552 EXPORT_SYMBOL(sock_setsockopt);
1553 
1554 static const struct cred *sk_get_peer_cred(struct sock *sk)
1555 {
1556 	const struct cred *cred;
1557 
1558 	spin_lock(&sk->sk_peer_lock);
1559 	cred = get_cred(sk->sk_peer_cred);
1560 	spin_unlock(&sk->sk_peer_lock);
1561 
1562 	return cred;
1563 }
1564 
1565 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1566 			  struct ucred *ucred)
1567 {
1568 	ucred->pid = pid_vnr(pid);
1569 	ucred->uid = ucred->gid = -1;
1570 	if (cred) {
1571 		struct user_namespace *current_ns = current_user_ns();
1572 
1573 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1574 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1575 	}
1576 }
1577 
1578 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1579 {
1580 	struct user_namespace *user_ns = current_user_ns();
1581 	int i;
1582 
1583 	for (i = 0; i < src->ngroups; i++) {
1584 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1585 
1586 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1587 			return -EFAULT;
1588 	}
1589 
1590 	return 0;
1591 }
1592 
/*
 * sk_getsockopt - read a socket-level option from @sk.
 * @sk: socket being queried
 * @level: not referenced anywhere in this function
 * @optname: SO_* option selector
 * @optval: destination for the option value
 * @optlen: in: size of the caller's buffer (an int); out: bytes produced
 *
 * The buffer size is read from @optlen into 'len'; a negative size fails
 * with -EINVAL.  Most cases store their result in the zero-initialised
 * union 'v' and set 'lv' to the result size (sizeof(int) by default); the
 * common tail then clamps 'len' to 'lv', copies 'len' bytes of 'v' to
 * @optval and writes 'len' back through @optlen.  Cases that copy their
 * own data either jump to 'lenout' (only the length is written back) or
 * return directly.
 *
 * Returns 0 on success or a negative errno.
 */
1593 int sk_getsockopt(struct sock *sk, int level, int optname,
1594 		  sockptr_t optval, sockptr_t optlen)
1595 {
1596 	struct socket *sock = sk->sk_socket;
1597 
1598 	union {
1599 		int val;
1600 		u64 val64;
1601 		unsigned long ulval;
1602 		struct linger ling;
1603 		struct old_timeval32 tm32;
1604 		struct __kernel_old_timeval tm;
1605 		struct  __kernel_sock_timeval stm;
1606 		struct sock_txtime txtime;
1607 		struct so_timestamping timestamping;
1608 	} v;
1609 
1610 	int lv = sizeof(int);
1611 	int len;
1612 
1613 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1614 		return -EFAULT;
1615 	if (len < 0)
1616 		return -EINVAL;
1617 
	/* Zero the whole union so bytes a case does not set are copied out as 0. */
1618 	memset(&v, 0, sizeof(v));
1619 
1620 	switch (optname) {
1621 	case SO_DEBUG:
1622 		v.val = sock_flag(sk, SOCK_DBG);
1623 		break;
1624 
1625 	case SO_DONTROUTE:
1626 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1627 		break;
1628 
1629 	case SO_BROADCAST:
1630 		v.val = sock_flag(sk, SOCK_BROADCAST);
1631 		break;
1632 
1633 	case SO_SNDBUF:
1634 		v.val = READ_ONCE(sk->sk_sndbuf);
1635 		break;
1636 
1637 	case SO_RCVBUF:
1638 		v.val = READ_ONCE(sk->sk_rcvbuf);
1639 		break;
1640 
1641 	case SO_REUSEADDR:
1642 		v.val = sk->sk_reuse;
1643 		break;
1644 
1645 	case SO_REUSEPORT:
1646 		v.val = sk->sk_reuseport;
1647 		break;
1648 
1649 	case SO_KEEPALIVE:
1650 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1651 		break;
1652 
1653 	case SO_TYPE:
1654 		v.val = sk->sk_type;
1655 		break;
1656 
1657 	case SO_PROTOCOL:
1658 		v.val = sk->sk_protocol;
1659 		break;
1660 
1661 	case SO_DOMAIN:
1662 		v.val = sk->sk_family;
1663 		break;
1664 
1665 	case SO_ERROR:
	/* Report the negated sock_error() result; when that is 0,
	 * fetch sk_err_soft instead and reset it to 0 atomically.
	 */
1666 		v.val = -sock_error(sk);
1667 		if (v.val == 0)
1668 			v.val = xchg(&sk->sk_err_soft, 0);
1669 		break;
1670 
1671 	case SO_OOBINLINE:
1672 		v.val = sock_flag(sk, SOCK_URGINLINE);
1673 		break;
1674 
1675 	case SO_NO_CHECK:
1676 		v.val = sk->sk_no_check_tx;
1677 		break;
1678 
1679 	case SO_PRIORITY:
1680 		v.val = READ_ONCE(sk->sk_priority);
1681 		break;
1682 
1683 	case SO_LINGER:
	/* sk_lingertime is divided by HZ, the inverse of the
	 * multiplication done in sk_setsockopt().
	 */
1684 		lv		= sizeof(v.ling);
1685 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1686 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1687 		break;
1688 
	/* Nothing to report: v.val stays 0 from the memset above. */
1689 	case SO_BSDCOMPAT:
1690 		break;
1691 
1692 	case SO_TIMESTAMP_OLD:
1693 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1694 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1695 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1696 		break;
1697 
1698 	case SO_TIMESTAMPNS_OLD:
1699 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1700 		break;
1701 
1702 	case SO_TIMESTAMP_NEW:
1703 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1704 		break;
1705 
1706 	case SO_TIMESTAMPNS_NEW:
1707 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1708 		break;
1709 
1710 	case SO_TIMESTAMPING_OLD:
1711 	case SO_TIMESTAMPING_NEW:
1712 		lv = sizeof(v.timestamping);
1713 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1714 		 * returning the flags when they were set through the same option.
1715 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1716 		 */
1717 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1718 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1719 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1720 		}
1721 		break;
1722 
1723 	case SO_RCVTIMEO_OLD:
1724 	case SO_RCVTIMEO_NEW:
	/* sock_get_timeout() fills 'v' and returns the size to report. */
1725 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1726 				      SO_RCVTIMEO_OLD == optname);
1727 		break;
1728 
1729 	case SO_SNDTIMEO_OLD:
1730 	case SO_SNDTIMEO_NEW:
1731 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1732 				      SO_SNDTIMEO_OLD == optname);
1733 		break;
1734 
1735 	case SO_RCVLOWAT:
1736 		v.val = READ_ONCE(sk->sk_rcvlowat);
1737 		break;
1738 
	/* Always reported as 1. */
1739 	case SO_SNDLOWAT:
1740 		v.val = 1;
1741 		break;
1742 
1743 	case SO_PASSCRED:
1744 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1745 		break;
1746 
1747 	case SO_PASSPIDFD:
1748 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1749 		break;
1750 
1751 	case SO_PEERCRED:
1752 	{
1753 		struct ucred peercred;
1754 		if (len > sizeof(peercred))
1755 			len = sizeof(peercred);
1756 
	/* Snapshot pid and credentials under sk_peer_lock. */
1757 		spin_lock(&sk->sk_peer_lock);
1758 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1759 		spin_unlock(&sk->sk_peer_lock);
1760 
1761 		if (copy_to_sockptr(optval, &peercred, len))
1762 			return -EFAULT;
1763 		goto lenout;
1764 	}
1765 
1766 	case SO_PEERPIDFD:
1767 	{
1768 		struct pid *peer_pid;
1769 		struct file *pidfd_file = NULL;
1770 		int pidfd;
1771 
1772 		if (len > sizeof(pidfd))
1773 			len = sizeof(pidfd);
1774 
1775 		spin_lock(&sk->sk_peer_lock);
1776 		peer_pid = get_pid(sk->sk_peer_pid);
1777 		spin_unlock(&sk->sk_peer_lock);
1778 
1779 		if (!peer_pid)
1780 			return -ENODATA;
1781 
1782 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1783 		put_pid(peer_pid);
1784 		if (pidfd < 0)
1785 			return pidfd;
1786 
	/* The fd is installed only after both copies succeeded; on
	 * failure the reserved fd number and the file are released.
	 */
1787 		if (copy_to_sockptr(optval, &pidfd, len) ||
1788 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1789 			put_unused_fd(pidfd);
1790 			fput(pidfd_file);
1791 
1792 			return -EFAULT;
1793 		}
1794 
1795 		fd_install(pidfd, pidfd_file);
1796 		return 0;
1797 	}
1798 
1799 	case SO_PEERGROUPS:
1800 	{
1801 		const struct cred *cred;
1802 		int ret, n;
1803 
1804 		cred = sk_get_peer_cred(sk);
1805 		if (!cred)
1806 			return -ENODATA;
1807 
1808 		n = cred->group_info->ngroups;
	/* Buffer too small: report the required size through optlen
	 * and fail with -ERANGE (or -EFAULT if even that copy fails).
	 */
1809 		if (len < n * sizeof(gid_t)) {
1810 			len = n * sizeof(gid_t);
1811 			put_cred(cred);
1812 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1813 		}
1814 		len = n * sizeof(gid_t);
1815 
1816 		ret = groups_to_user(optval, cred->group_info);
1817 		put_cred(cred);
1818 		if (ret)
1819 			return ret;
1820 		goto lenout;
1821 	}
1822 
1823 	case SO_PEERNAME:
1824 	{
1825 		struct sockaddr_storage address;
1826 
	/* A negative getname() result is reported as -ENOTCONN; asking
	 * for more bytes than the address holds is -EINVAL.
	 */
1827 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1828 		if (lv < 0)
1829 			return -ENOTCONN;
1830 		if (lv < len)
1831 			return -EINVAL;
1832 		if (copy_to_sockptr(optval, &address, len))
1833 			return -EFAULT;
1834 		goto lenout;
1835 	}
1836 
1837 	/* Dubious BSD thing... Probably nobody even uses it, but
1838 	 * the UNIX standard wants it for whatever reason... -DaveM
1839 	 */
1840 	case SO_ACCEPTCONN:
1841 		v.val = sk->sk_state == TCP_LISTEN;
1842 		break;
1843 
1844 	case SO_PASSSEC:
1845 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1846 		break;
1847 
1848 	case SO_PEERSEC:
1849 		return security_socket_getpeersec_stream(sock,
1850 							 optval, optlen, len);
1851 
1852 	case SO_MARK:
1853 		v.val = READ_ONCE(sk->sk_mark);
1854 		break;
1855 
1856 	case SO_RCVMARK:
1857 		v.val = sock_flag(sk, SOCK_RCVMARK);
1858 		break;
1859 
1860 	case SO_RXQ_OVFL:
1861 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1862 		break;
1863 
1864 	case SO_WIFI_STATUS:
1865 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1866 		break;
1867 
1868 	case SO_PEEK_OFF:
	/* Only readable when the socket ops provide set_peek_off. */
1869 		if (!READ_ONCE(sock->ops)->set_peek_off)
1870 			return -EOPNOTSUPP;
1871 
1872 		v.val = READ_ONCE(sk->sk_peek_off);
1873 		break;
1874 	case SO_NOFCS:
1875 		v.val = sock_flag(sk, SOCK_NOFCS);
1876 		break;
1877 
1878 	case SO_BINDTODEVICE:
1879 		return sock_getbindtodevice(sk, optval, optlen, len);
1880 
1881 	case SO_GET_FILTER:
	/* sk_get_filter() copies out itself; its non-negative return
	 * value becomes the length written back through optlen.
	 */
1882 		len = sk_get_filter(sk, optval, len);
1883 		if (len < 0)
1884 			return len;
1885 
1886 		goto lenout;
1887 
1888 	case SO_LOCK_FILTER:
1889 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1890 		break;
1891 
1892 	case SO_BPF_EXTENSIONS:
1893 		v.val = bpf_tell_extensions();
1894 		break;
1895 
1896 	case SO_SELECT_ERR_QUEUE:
1897 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1898 		break;
1899 
1900 #ifdef CONFIG_NET_RX_BUSY_POLL
1901 	case SO_BUSY_POLL:
1902 		v.val = READ_ONCE(sk->sk_ll_usec);
1903 		break;
1904 	case SO_PREFER_BUSY_POLL:
1905 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1906 		break;
1907 #endif
1908 
1909 	case SO_MAX_PACING_RATE:
1910 		/* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
	/* Return the full unsigned long when it is wider than int and
	 * the caller's buffer can hold it; otherwise an int saturated
	 * at ~0U.
	 */
1911 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1912 			lv = sizeof(v.ulval);
1913 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1914 		} else {
1915 			/* 32bit version */
1916 			v.val = min_t(unsigned long, ~0U,
1917 				      READ_ONCE(sk->sk_max_pacing_rate));
1918 		}
1919 		break;
1920 
1921 	case SO_INCOMING_CPU:
1922 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1923 		break;
1924 
1925 	case SO_MEMINFO:
1926 	{
1927 		u32 meminfo[SK_MEMINFO_VARS];
1928 
1929 		sk_get_meminfo(sk, meminfo);
1930 
	/* Copy out at most the size of the array; shorter buffers get a prefix. */
1931 		len = min_t(unsigned int, len, sizeof(meminfo));
1932 		if (copy_to_sockptr(optval, &meminfo, len))
1933 			return -EFAULT;
1934 
1935 		goto lenout;
1936 	}
1937 
1938 #ifdef CONFIG_NET_RX_BUSY_POLL
1939 	case SO_INCOMING_NAPI_ID:
1940 		v.val = READ_ONCE(sk->sk_napi_id);
1941 
1942 		/* aggregate non-NAPI IDs down to 0 */
1943 		if (v.val < MIN_NAPI_ID)
1944 			v.val = 0;
1945 
1946 		break;
1947 #endif
1948 
1949 	case SO_COOKIE:
	/* Needs room for a u64; larger buffers are accepted. */
1950 		lv = sizeof(u64);
1951 		if (len < lv)
1952 			return -EINVAL;
1953 		v.val64 = sock_gen_cookie(sk);
1954 		break;
1955 
1956 	case SO_ZEROCOPY:
1957 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1958 		break;
1959 
1960 	case SO_TXTIME:
	/* v.txtime.flags starts at 0 (memset above); OR in the mode bits. */
1961 		lv = sizeof(v.txtime);
1962 		v.txtime.clockid = sk->sk_clockid;
1963 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1964 				  SOF_TXTIME_DEADLINE_MODE : 0;
1965 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1966 				  SOF_TXTIME_REPORT_ERRORS : 0;
1967 		break;
1968 
1969 	case SO_BINDTOIFINDEX:
1970 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1971 		break;
1972 
1973 	case SO_NETNS_COOKIE:
	/* Unlike SO_COOKIE, the buffer must be exactly sizeof(u64). */
1974 		lv = sizeof(u64);
1975 		if (len != lv)
1976 			return -EINVAL;
1977 		v.val64 = sock_net(sk)->net_cookie;
1978 		break;
1979 
1980 	case SO_BUF_LOCK:
1981 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1982 		break;
1983 
1984 	case SO_RESERVE_MEM:
1985 		v.val = READ_ONCE(sk->sk_reserved_mem);
1986 		break;
1987 
1988 	case SO_TXREHASH:
1989 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
1990 		v.val = READ_ONCE(sk->sk_txrehash);
1991 		break;
1992 
1993 	default:
1994 		/* We implement the SO_SNDLOWAT etc to not be settable
1995 		 * (1003.1g 7).
1996 		 */
1997 		return -ENOPROTOOPT;
1998 	}
1999 
	/* Common copy-out: never hand back more than the option's size. */
2000 	if (len > lv)
2001 		len = lv;
2002 	if (copy_to_sockptr(optval, &v, len))
2003 		return -EFAULT;
	/* Report how many bytes were produced. */
2004 lenout:
2005 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2006 		return -EFAULT;
2007 	return 0;
2008 }
2009 
2010 /*
2011  * Initialize an sk_lock.
2012  *
2013  * (We also register the sk_lock with the lock validator.)
2014  */
2015 static inline void sock_lock_init(struct sock *sk)
2016 {
2017 	if (sk->sk_kern_sock)
2018 		sock_lock_init_class_and_name(
2019 			sk,
2020 			af_family_kern_slock_key_strings[sk->sk_family],
2021 			af_family_kern_slock_keys + sk->sk_family,
2022 			af_family_kern_key_strings[sk->sk_family],
2023 			af_family_kern_keys + sk->sk_family);
2024 	else
2025 		sock_lock_init_class_and_name(
2026 			sk,
2027 			af_family_slock_key_strings[sk->sk_family],
2028 			af_family_slock_keys + sk->sk_family,
2029 			af_family_key_strings[sk->sk_family],
2030 			af_family_keys + sk->sk_family);
2031 }
2032 
2033 /*
2034  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2035  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2036  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2037  */
2038 static void sock_copy(struct sock *nsk, const struct sock *osk)
2039 {
2040 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2041 #ifdef CONFIG_SECURITY_NETWORK
2042 	void *sptr = nsk->sk_security;
2043 #endif
2044 
2045 	/* If we move sk_tx_queue_mapping out of the private section,
2046 	 * we must check if sk_tx_queue_clear() is called after
2047 	 * sock_copy() in sk_clone_lock().
2048 	 */
2049 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2050 		     offsetof(struct sock, sk_dontcopy_begin) ||
2051 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2052 		     offsetof(struct sock, sk_dontcopy_end));
2053 
2054 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2055 
2056 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2057 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2058 
2059 #ifdef CONFIG_SECURITY_NETWORK
2060 	nsk->sk_security = sptr;
2061 	security_sk_clone(osk, nsk);
2062 #endif
2063 }
2064 
2065 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2066 		int family)
2067 {
2068 	struct sock *sk;
2069 	struct kmem_cache *slab;
2070 
2071 	slab = prot->slab;
2072 	if (slab != NULL) {
2073 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2074 		if (!sk)
2075 			return sk;
2076 		if (want_init_on_alloc(priority))
2077 			sk_prot_clear_nulls(sk, prot->obj_size);
2078 	} else
2079 		sk = kmalloc(prot->obj_size, priority);
2080 
2081 	if (sk != NULL) {
2082 		if (security_sk_alloc(sk, family, priority))
2083 			goto out_free;
2084 
2085 		if (!try_module_get(prot->owner))
2086 			goto out_free_sec;
2087 	}
2088 
2089 	return sk;
2090 
2091 out_free_sec:
2092 	security_sk_free(sk);
2093 out_free:
2094 	if (slab != NULL)
2095 		kmem_cache_free(slab, sk);
2096 	else
2097 		kfree(sk);
2098 	return NULL;
2099 }
2100 
2101 static void sk_prot_free(struct proto *prot, struct sock *sk)
2102 {
2103 	struct kmem_cache *slab;
2104 	struct module *owner;
2105 
2106 	owner = prot->owner;
2107 	slab = prot->slab;
2108 
2109 	cgroup_sk_free(&sk->sk_cgrp_data);
2110 	mem_cgroup_sk_free(sk);
2111 	security_sk_free(sk);
2112 	if (slab != NULL)
2113 		kmem_cache_free(slab, sk);
2114 	else
2115 		kfree(sk);
2116 	module_put(owner);
2117 }
2118 
2119 /**
2120  *	sk_alloc - All socket objects are allocated here
2121  *	@net: the applicable net namespace
2122  *	@family: protocol family
2123  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2124  *	@prot: struct proto associated with this new sock instance
2125  *	@kern: is this to be a kernel socket?
2126  */
2127 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2128 		      struct proto *prot, int kern)
2129 {
2130 	struct sock *sk;
2131 
2132 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2133 	if (sk) {
2134 		sk->sk_family = family;
2135 		/*
2136 		 * See comment in struct sock definition to understand
2137 		 * why we need sk_prot_creator -acme
2138 		 */
2139 		sk->sk_prot = sk->sk_prot_creator = prot;
2140 		sk->sk_kern_sock = kern;
2141 		sock_lock_init(sk);
2142 		sk->sk_net_refcnt = kern ? 0 : 1;
2143 		if (likely(sk->sk_net_refcnt)) {
2144 			get_net_track(net, &sk->ns_tracker, priority);
2145 			sock_inuse_add(net, 1);
2146 		} else {
2147 			__netns_tracker_alloc(net, &sk->ns_tracker,
2148 					      false, priority);
2149 		}
2150 
2151 		sock_net_set(sk, net);
2152 		refcount_set(&sk->sk_wmem_alloc, 1);
2153 
2154 		mem_cgroup_sk_alloc(sk);
2155 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2156 		sock_update_classid(&sk->sk_cgrp_data);
2157 		sock_update_netprioidx(&sk->sk_cgrp_data);
2158 		sk_tx_queue_clear(sk);
2159 	}
2160 
2161 	return sk;
2162 }
2163 EXPORT_SYMBOL(sk_alloc);
2164 
2165 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2166  * grace period. This is the case for UDP sockets and TCP listeners.
2167  */
2168 static void __sk_destruct(struct rcu_head *head)
2169 {
2170 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2171 	struct sk_filter *filter;
2172 
2173 	if (sk->sk_destruct)
2174 		sk->sk_destruct(sk);
2175 
2176 	filter = rcu_dereference_check(sk->sk_filter,
2177 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2178 	if (filter) {
2179 		sk_filter_uncharge(sk, filter);
2180 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2181 	}
2182 
2183 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2184 
2185 #ifdef CONFIG_BPF_SYSCALL
2186 	bpf_sk_storage_free(sk);
2187 #endif
2188 
2189 	if (atomic_read(&sk->sk_omem_alloc))
2190 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2191 			 __func__, atomic_read(&sk->sk_omem_alloc));
2192 
2193 	if (sk->sk_frag.page) {
2194 		put_page(sk->sk_frag.page);
2195 		sk->sk_frag.page = NULL;
2196 	}
2197 
2198 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2199 	put_cred(sk->sk_peer_cred);
2200 	put_pid(sk->sk_peer_pid);
2201 
2202 	if (likely(sk->sk_net_refcnt))
2203 		put_net_track(sock_net(sk), &sk->ns_tracker);
2204 	else
2205 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2206 
2207 	sk_prot_free(sk->sk_prot_creator, sk);
2208 }
2209 
2210 void sk_destruct(struct sock *sk)
2211 {
2212 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2213 
2214 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2215 		reuseport_detach_sock(sk);
2216 		use_call_rcu = true;
2217 	}
2218 
2219 	if (use_call_rcu)
2220 		call_rcu(&sk->sk_rcu, __sk_destruct);
2221 	else
2222 		__sk_destruct(&sk->sk_rcu);
2223 }
2224 
2225 static void __sk_free(struct sock *sk)
2226 {
2227 	if (likely(sk->sk_net_refcnt))
2228 		sock_inuse_add(sock_net(sk), -1);
2229 
2230 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2231 		sock_diag_broadcast_destroy(sk);
2232 	else
2233 		sk_destruct(sk);
2234 }
2235 
2236 void sk_free(struct sock *sk)
2237 {
2238 	/*
2239 	 * We subtract one from sk_wmem_alloc and can know if
2240 	 * some packets are still in some tx queue.
2241 	 * If not null, sock_wfree() will call __sk_free(sk) later
2242 	 */
2243 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2244 		__sk_free(sk);
2245 }
2246 EXPORT_SYMBOL(sk_free);
2247 
2248 static void sk_init_common(struct sock *sk)
2249 {
2250 	skb_queue_head_init(&sk->sk_receive_queue);
2251 	skb_queue_head_init(&sk->sk_write_queue);
2252 	skb_queue_head_init(&sk->sk_error_queue);
2253 
2254 	rwlock_init(&sk->sk_callback_lock);
2255 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2256 			af_rlock_keys + sk->sk_family,
2257 			af_family_rlock_key_strings[sk->sk_family]);
2258 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2259 			af_wlock_keys + sk->sk_family,
2260 			af_family_wlock_key_strings[sk->sk_family]);
2261 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2262 			af_elock_keys + sk->sk_family,
2263 			af_family_elock_key_strings[sk->sk_family]);
2264 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2265 			af_callback_keys + sk->sk_family,
2266 			af_family_clock_key_strings[sk->sk_family]);
2267 }
2268 
/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 *
 *	Return: the clone, bh-locked and with sk_refcnt set to 2, or %NULL
 *	if the allocation, the filter charge, the xfrm policy clone or the
 *	bpf storage clone failed (in which case the clone has already been
 *	unlocked and freed here).
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	/* Start from a copy of the parent, then reset per-socket state. */
	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	/* The clone stays bh-locked from here on. */
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem  = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	/* Inherit the parent's user locks except SOCK_BINDPORT_LOCK. */
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	/* The clone is not attached to any struct socket or wait queue yet. */
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	/* The copied timestamp flags need their own net timestamp reference. */
	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2395 
/* Release a bh-locked socket that is being discarded: clear its
 * destructor, drop the bh lock and free it with sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free() */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2405 
2406 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2407 {
2408 	bool is_ipv6 = false;
2409 	u32 max_size;
2410 
2411 #if IS_ENABLED(CONFIG_IPV6)
2412 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2413 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2414 #endif
2415 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2416 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2417 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2418 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2419 		max_size = GSO_LEGACY_MAX_SIZE;
2420 
2421 	return max_size - (MAX_TCP_HEADER + 1);
2422 }
2423 
2424 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2425 {
2426 	u32 max_segs = 1;
2427 
2428 	sk->sk_route_caps = dst->dev->features;
2429 	if (sk_is_tcp(sk))
2430 		sk->sk_route_caps |= NETIF_F_GSO;
2431 	if (sk->sk_route_caps & NETIF_F_GSO)
2432 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2433 	if (unlikely(sk->sk_gso_disabled))
2434 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2435 	if (sk_can_gso(sk)) {
2436 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2437 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2438 		} else {
2439 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2440 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2441 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2442 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2443 		}
2444 	}
2445 	sk->sk_gso_max_segs = max_segs;
2446 	sk_dst_set(sk, dst);
2447 }
2448 EXPORT_SYMBOL_GPL(sk_setup_caps);
2449 
2450 /*
2451  *	Simple resource managers for sockets.
2452  */
2453 
2454 
/*
 * Write buffer destructor automatically called from kfree_skb.
 *
 * Returns skb->truesize to the owning socket's sk_wmem_alloc. Unless the
 * socket has SOCK_USE_WRITE_QUEUE set, the socket is also notified that
 * write space became available. If sk_wmem_alloc reaches zero the socket
 * itself is freed through __sk_free().
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;
	bool free;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/* Sockets with SOCK_RCU_FREE that still use the default
		 * write_space callback: subtract everything at once and do
		 * the wakeup inside an RCU read-side section.
		 * NOTE(review): presumably safe because such sockets are
		 * destroyed via call_rcu() - confirm.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
			sock_def_write_space_wfree(sk);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		/* Only the reference kept above is left to drop. */
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2492 
2493 /* This variant of sock_wfree() is used by TCP,
2494  * since it sets SOCK_USE_WRITE_QUEUE.
2495  */
2496 void __sock_wfree(struct sk_buff *skb)
2497 {
2498 	struct sock *sk = skb->sk;
2499 
2500 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2501 		__sk_free(sk);
2502 }
2503 
/* Make @sk the owner of @skb for the transmit path.
 *
 * Any previous owner is dropped with skb_orphan(). For a full socket the
 * skb's truesize is added to sk_wmem_alloc and sock_wfree() becomes the
 * destructor. With CONFIG_INET, a socket that is not a full socket gets
 * a plain reference (sock_hold()) and sock_edemux() as destructor.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2525 
2526 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2527 {
2528 #ifdef CONFIG_TLS_DEVICE
2529 	/* Drivers depend on in-order delivery for crypto offload,
2530 	 * partial orphan breaks out-of-order-OK logic.
2531 	 */
2532 	if (skb->decrypted)
2533 		return false;
2534 #endif
2535 	return (skb->destructor == sock_wfree ||
2536 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2537 }
2538 
2539 /* This helper is used by netem, as it can hold packets in its
2540  * delay queue. We want to allow the owner socket to send more
2541  * packets, as if they were already TX completed by a typical driver.
2542  * But we also want to keep skb->sk set because some packet schedulers
2543  * rely on it (sch_fq for example).
2544  */
2545 void skb_orphan_partial(struct sk_buff *skb)
2546 {
2547 	if (skb_is_tcp_pure_ack(skb))
2548 		return;
2549 
2550 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2551 		return;
2552 
2553 	skb_orphan(skb);
2554 }
2555 EXPORT_SYMBOL(skb_orphan_partial);
2556 
2557 /*
2558  * Read buffer destructor automatically called from kfree_skb.
2559  */
2560 void sock_rfree(struct sk_buff *skb)
2561 {
2562 	struct sock *sk = skb->sk;
2563 	unsigned int len = skb->truesize;
2564 
2565 	atomic_sub(len, &sk->sk_rmem_alloc);
2566 	sk_mem_uncharge(sk, len);
2567 }
2568 EXPORT_SYMBOL(sock_rfree);
2569 
/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 * It only drops one reference on skb->sk.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);
2579 
/* Buffer destructor for the prefetch/receive path, where a reference
 * on the socket may not be held (e.g. for listen sockets): the socket
 * is only put when sk_is_refcounted() says a reference was taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (!sk_is_refcounted(skb->sk))
		return;

	sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2591 
2592 kuid_t sock_i_uid(struct sock *sk)
2593 {
2594 	kuid_t uid;
2595 
2596 	read_lock_bh(&sk->sk_callback_lock);
2597 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2598 	read_unlock_bh(&sk->sk_callback_lock);
2599 	return uid;
2600 }
2601 EXPORT_SYMBOL(sock_i_uid);
2602 
2603 unsigned long __sock_i_ino(struct sock *sk)
2604 {
2605 	unsigned long ino;
2606 
2607 	read_lock(&sk->sk_callback_lock);
2608 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2609 	read_unlock(&sk->sk_callback_lock);
2610 	return ino;
2611 }
2612 EXPORT_SYMBOL(__sock_i_ino);
2613 
/* Same as __sock_i_ino(), with bottom halves disabled around the call. */
unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	local_bh_disable();
	ino = __sock_i_ino(sk);
	local_bh_enable();
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);
2624 
2625 /*
2626  * Allocate a skb from the socket's send buffer.
2627  */
2628 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2629 			     gfp_t priority)
2630 {
2631 	if (force ||
2632 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2633 		struct sk_buff *skb = alloc_skb(size, priority);
2634 
2635 		if (skb) {
2636 			skb_set_owner_w(skb, sk);
2637 			return skb;
2638 		}
2639 	}
2640 	return NULL;
2641 }
2642 EXPORT_SYMBOL(sock_wmalloc);
2643 
2644 static void sock_ofree(struct sk_buff *skb)
2645 {
2646 	struct sock *sk = skb->sk;
2647 
2648 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2649 }
2650 
2651 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2652 			     gfp_t priority)
2653 {
2654 	struct sk_buff *skb;
2655 
2656 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2657 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2658 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2659 		return NULL;
2660 
2661 	skb = alloc_skb(size, priority);
2662 	if (!skb)
2663 		return NULL;
2664 
2665 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2666 	skb->sk = sk;
2667 	skb->destructor = sock_ofree;
2668 	return skb;
2669 }
2670 
2671 /*
2672  * Allocate a memory block from the socket's option memory buffer.
2673  */
2674 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2675 {
2676 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2677 
2678 	if ((unsigned int)size <= optmem_max &&
2679 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2680 		void *mem;
2681 		/* First do the add, to avoid the race if kmalloc
2682 		 * might sleep.
2683 		 */
2684 		atomic_add(size, &sk->sk_omem_alloc);
2685 		mem = kmalloc(size, priority);
2686 		if (mem)
2687 			return mem;
2688 		atomic_sub(size, &sk->sk_omem_alloc);
2689 	}
2690 	return NULL;
2691 }
2692 EXPORT_SYMBOL(sock_kmalloc);
2693 
2694 /* Free an option memory block. Note, we actually want the inline
2695  * here as this allows gcc to detect the nullify and fold away the
2696  * condition entirely.
2697  */
2698 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2699 				  const bool nullify)
2700 {
2701 	if (WARN_ON_ONCE(!mem))
2702 		return;
2703 	if (nullify)
2704 		kfree_sensitive(mem);
2705 	else
2706 		kfree(mem);
2707 	atomic_sub(size, &sk->sk_omem_alloc);
2708 }
2709 
/* Free an option memory block with kfree() and uncharge @size bytes
 * from the socket's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2715 
/* Like sock_kfree_s(), but the block is released with kfree_sensitive(). */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2721 
/* Sleep until send buffer space may be available on @sk.
 *
 * This is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * The wait ends when the timeout expires, a signal is pending,
 * sk_wmem_alloc drops below sk_sndbuf, the send side is shut down or a
 * socket error is set. Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		/* Queue ourselves before re-checking the wakeup conditions. */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
2748 
2749 
2750 /*
2751  *	Generic send/receive buffer handlers
2752  */
2753 
/* Allocate a send skb for @sk, waiting for send buffer space if needed.
 *
 * @header_len, @data_len and @max_page_order are handed to
 * alloc_skb_with_frags(); @noblock selects the send timeout through
 * sock_sndtimeo(). The loop gives up with a pending socket error,
 * -EPIPE once the send side is shut down, -EAGAIN when the timeout is
 * exhausted, or the sock_intr_errno() value when a signal is pending.
 *
 * Returns the skb, owned by @sk, or NULL. On the failure paths of the
 * wait loop *@errcode holds the error; when the allocation itself fails
 * @errcode is whatever alloc_skb_with_frags() stored there.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		/* Room left in the send buffer: go allocate. */
		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
2797 
2798 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2799 		     struct sockcm_cookie *sockc)
2800 {
2801 	u32 tsflags;
2802 
2803 	switch (cmsg->cmsg_type) {
2804 	case SO_MARK:
2805 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2806 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2807 			return -EPERM;
2808 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2809 			return -EINVAL;
2810 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2811 		break;
2812 	case SO_TIMESTAMPING_OLD:
2813 	case SO_TIMESTAMPING_NEW:
2814 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2815 			return -EINVAL;
2816 
2817 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2818 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2819 			return -EINVAL;
2820 
2821 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2822 		sockc->tsflags |= tsflags;
2823 		break;
2824 	case SCM_TXTIME:
2825 		if (!sock_flag(sk, SOCK_TXTIME))
2826 			return -EINVAL;
2827 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2828 			return -EINVAL;
2829 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2830 		break;
2831 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2832 	case SCM_RIGHTS:
2833 	case SCM_CREDENTIALS:
2834 		break;
2835 	default:
2836 		return -EINVAL;
2837 	}
2838 	return 0;
2839 }
2840 EXPORT_SYMBOL(__sock_cmsg_send);
2841 
2842 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2843 		   struct sockcm_cookie *sockc)
2844 {
2845 	struct cmsghdr *cmsg;
2846 	int ret;
2847 
2848 	for_each_cmsghdr(cmsg, msg) {
2849 		if (!CMSG_OK(msg, cmsg))
2850 			return -EINVAL;
2851 		if (cmsg->cmsg_level != SOL_SOCKET)
2852 			continue;
2853 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2854 		if (ret)
2855 			return ret;
2856 	}
2857 	return 0;
2858 }
2859 EXPORT_SYMBOL(sock_cmsg_send);
2860 
2861 static void sk_enter_memory_pressure(struct sock *sk)
2862 {
2863 	if (!sk->sk_prot->enter_memory_pressure)
2864 		return;
2865 
2866 	sk->sk_prot->enter_memory_pressure(sk);
2867 }
2868 
2869 static void sk_leave_memory_pressure(struct sock *sk)
2870 {
2871 	if (sk->sk_prot->leave_memory_pressure) {
2872 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2873 				     tcp_leave_memory_pressure, sk);
2874 	} else {
2875 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2876 
2877 		if (memory_pressure && READ_ONCE(*memory_pressure))
2878 			WRITE_ONCE(*memory_pressure, 0);
2879 	}
2880 }
2881 
/* Static key, off by default. When it is enabled, skb_page_frag_refill()
 * skips its high-order page attempt and allocates single pages only.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2883 
2884 /**
2885  * skb_page_frag_refill - check that a page_frag contains enough room
2886  * @sz: minimum size of the fragment we want to get
2887  * @pfrag: pointer to page_frag
2888  * @gfp: priority for memory allocation
2889  *
2890  * Note: While this allocator tries to use high order pages, there is
2891  * no guarantee that allocations succeed. Therefore, @sz MUST be
2892  * less or equal than PAGE_SIZE.
2893  */
2894 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2895 {
2896 	if (pfrag->page) {
2897 		if (page_ref_count(pfrag->page) == 1) {
2898 			pfrag->offset = 0;
2899 			return true;
2900 		}
2901 		if (pfrag->offset + sz <= pfrag->size)
2902 			return true;
2903 		put_page(pfrag->page);
2904 	}
2905 
2906 	pfrag->offset = 0;
2907 	if (SKB_FRAG_PAGE_ORDER &&
2908 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2909 		/* Avoid direct reclaim but allow kswapd to wake */
2910 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2911 					  __GFP_COMP | __GFP_NOWARN |
2912 					  __GFP_NORETRY,
2913 					  SKB_FRAG_PAGE_ORDER);
2914 		if (likely(pfrag->page)) {
2915 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2916 			return true;
2917 		}
2918 	}
2919 	pfrag->page = alloc_page(gfp);
2920 	if (likely(pfrag->page)) {
2921 		pfrag->size = PAGE_SIZE;
2922 		return true;
2923 	}
2924 	return false;
2925 }
2926 EXPORT_SYMBOL(skb_page_frag_refill);
2927 
2928 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2929 {
2930 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2931 		return true;
2932 
2933 	sk_enter_memory_pressure(sk);
2934 	sk_stream_moderate_sndbuf(sk);
2935 	return false;
2936 }
2937 EXPORT_SYMBOL(sk_page_frag_refill);
2938 
/* Sleep, uninterruptibly, until the socket is no longer owned by a user.
 *
 * Entered and left with sk->sk_lock.slock held (see the sparse
 * annotations); the spinlock is dropped around each schedule().
 */
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		/* Ownership is re-checked with the spinlock held again. */
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2956 
/* Process every skb queued on the socket backlog.
 *
 * Entered and left with sk->sk_lock.slock held (see the sparse
 * annotations). Each pass detaches the whole backlog list, drops the
 * spinlock, and feeds the skbs one by one to sk_backlog_rcv(); the outer
 * loop repeats while new skbs were queued in the meantime.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Detach the current list so it can be walked unlocked. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2989 
/* Take the socket spinlock, process the backlog with __release_sock()
 * and then run the protocol's release_cb hook, if any, before unlocking.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3002 
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * Return: the result of sk_wait_event(), whose condition is that the
 * tail of sk_receive_queue differs from @skb.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	/* SOCKWQ_ASYNC_WAITDATA stays set only for the duration of the wait. */
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);
3027 
/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 *
 *	Return: 1 if the allocation is accepted, 0 if it is suppressed, in
 *	which case the protocol and memcg accounting done here is undone.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
	struct proto *prot = sk->sk_prot;
	bool charged = false;
	long allocated;

	/* Account optimistically; rolled back at the end on refusal. */
	sk_memory_allocated_add(sk, amt);
	allocated = sk_memory_allocated(sk);

	if (memcg) {
		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
			goto suppress_allocation;
		charged = true;
	}

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT stand when exceeding the global or memcg's
	 * hard limit, or else a DoS attack can take place by spawning
	 * lots of sockets whose usage is under the minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones that are below average
		 * usage to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg && !charged) {
				mem_cgroup_charge_skmem(memcg, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			}
			return 1;
		}
	}

	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Refused: undo the accounting done at the top of the function. */
	sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_uncharge_skmem(memcg, amt);

	return 0;
}
3146 
3147 /**
3148  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3149  *	@sk: socket
3150  *	@size: memory size to allocate
3151  *	@kind: allocation type
3152  *
3153  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3154  *	rmem allocation. This function assumes that protocols which have
3155  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3156  */
3157 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3158 {
3159 	int ret, amt = sk_mem_pages(size);
3160 
3161 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3162 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3163 	if (!ret)
3164 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3165 	return ret;
3166 }
3167 EXPORT_SYMBOL(__sk_mem_schedule);
3168 
3169 /**
3170  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3171  *	@sk: socket
3172  *	@amount: number of quanta
3173  *
3174  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3175  */
3176 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3177 {
3178 	sk_memory_allocated_sub(sk, amount);
3179 
3180 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3181 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3182 
3183 	if (sk_under_global_memory_pressure(sk) &&
3184 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3185 		sk_leave_memory_pressure(sk);
3186 }
3187 
3188 /**
3189  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3190  *	@sk: socket
3191  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3192  */
3193 void __sk_mem_reclaim(struct sock *sk, int amount)
3194 {
3195 	amount >>= PAGE_SHIFT;
3196 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3197 	__sk_mem_reduce_allocated(sk, amount);
3198 }
3199 EXPORT_SYMBOL(__sk_mem_reclaim);
3200 
/* Store @val into sk->sk_peek_off with WRITE_ONCE(). Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
3207 
3208 /*
3209  * Set of default routines for initialising struct proto_ops when
3210  * the protocol does not support a particular function. In certain
3211  * cases where it makes no sense for a protocol to have a "do nothing"
3212  * function, some default processing is provided.
3213  */
3214 
/* bind stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
3220 
/* connect stub for protocols without connect support: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
3227 
/* socketpair stub for protocols without socketpair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
3233 
/* accept stub for protocols without accept support: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
3240 
/* getname stub for protocols without getname support: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
3247 
/* ioctl stub for protocols without ioctl support: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
3253 
/* listen stub for protocols without listen support: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
3259 
/* shutdown stub for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
3265 
/* sendmsg stub for protocols without sendmsg support: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
3271 
/* Variant of sock_no_sendmsg() taking a struct sock: always -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
3277 
/* recvmsg stub for protocols without recvmsg support: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
3284 
/**
 * sock_no_mmap - mmap() stub for protocols without mmap support
 * @file: file backing the socket (ignored)
 * @sock: socket the request was issued on (ignored)
 * @vma: memory area that was to be mapped (ignored)
 *
 * Unlike the other sock_no_*() stubs this one reports -ENODEV, to match
 * the error code used when an mmap method is missing.
 *
 * Return: -ENODEV, unconditionally.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
3291 
3292 /*
3293  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3294  * various sock-based usage counts.
3295  */
3296 void __receive_sock(struct file *file)
3297 {
3298 	struct socket *sock;
3299 
3300 	sock = sock_from_file(file);
3301 	if (sock) {
3302 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3303 		sock_update_classid(&sock->sk->sk_cgrp_data);
3304 	}
3305 }
3306 
3307 /*
3308  *	Default Socket Callbacks
3309  */
3310 
3311 static void sock_def_wakeup(struct sock *sk)
3312 {
3313 	struct socket_wq *wq;
3314 
3315 	rcu_read_lock();
3316 	wq = rcu_dereference(sk->sk_wq);
3317 	if (skwq_has_sleeper(wq))
3318 		wake_up_interruptible_all(&wq->wait);
3319 	rcu_read_unlock();
3320 }
3321 
3322 static void sock_def_error_report(struct sock *sk)
3323 {
3324 	struct socket_wq *wq;
3325 
3326 	rcu_read_lock();
3327 	wq = rcu_dereference(sk->sk_wq);
3328 	if (skwq_has_sleeper(wq))
3329 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3330 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3331 	rcu_read_unlock();
3332 }
3333 
3334 void sock_def_readable(struct sock *sk)
3335 {
3336 	struct socket_wq *wq;
3337 
3338 	trace_sk_data_ready(sk);
3339 
3340 	rcu_read_lock();
3341 	wq = rcu_dereference(sk->sk_wq);
3342 	if (skwq_has_sleeper(wq))
3343 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3344 						EPOLLRDNORM | EPOLLRDBAND);
3345 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3346 	rcu_read_unlock();
3347 }
3348 
3349 static void sock_def_write_space(struct sock *sk)
3350 {
3351 	struct socket_wq *wq;
3352 
3353 	rcu_read_lock();
3354 
3355 	/* Do not wake up a writer until he can make "significant"
3356 	 * progress.  --DaveM
3357 	 */
3358 	if (sock_writeable(sk)) {
3359 		wq = rcu_dereference(sk->sk_wq);
3360 		if (skwq_has_sleeper(wq))
3361 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3362 						EPOLLWRNORM | EPOLLWRBAND);
3363 
3364 		/* Should agree with poll, otherwise some programs break */
3365 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3366 	}
3367 
3368 	rcu_read_unlock();
3369 }
3370 
3371 /* An optimised version of sock_def_write_space(), should only be called
3372  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3373  * ->sk_wmem_alloc.
3374  */
3375 static void sock_def_write_space_wfree(struct sock *sk)
3376 {
3377 	/* Do not wake up a writer until he can make "significant"
3378 	 * progress.  --DaveM
3379 	 */
3380 	if (sock_writeable(sk)) {
3381 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3382 
3383 		/* rely on refcount_sub from sock_wfree() */
3384 		smp_mb__after_atomic();
3385 		if (wq && waitqueue_active(&wq->wait))
3386 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3387 						EPOLLWRNORM | EPOLLWRBAND);
3388 
3389 		/* Should agree with poll, otherwise some programs break */
3390 		sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3391 	}
3392 }
3393 
/* Default ->sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3397 
3398 void sk_send_sigurg(struct sock *sk)
3399 {
3400 	if (sk->sk_socket && sk->sk_socket->file)
3401 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3402 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3403 }
3404 EXPORT_SYMBOL(sk_send_sigurg);
3405 
/**
 * sk_reset_timer - (re)arm a socket timer
 * @sk: sock that owns @timer
 * @timer: timer to modify
 * @expires: new expiry time, passed straight to mod_timer()
 *
 * Takes a reference on @sk when mod_timer() returns 0.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	int was_active = mod_timer(timer, expires);

	if (!was_active)
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3413 
/**
 * sk_stop_timer - cancel a socket timer
 * @sk: sock that owns @timer
 * @timer: timer to delete
 *
 * Drops one reference on @sk with __sock_put() when del_timer() returns
 * nonzero (presumably the reference taken by sk_reset_timer()).
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3420 
/**
 * sk_stop_timer_sync - cancel a socket timer using del_timer_sync()
 * @sk: sock that owns @timer
 * @timer: timer to delete
 *
 * Same as sk_stop_timer() but built on del_timer_sync(); one reference
 * on @sk is dropped when that call returns nonzero.
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3427 
3428 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3429 {
3430 	sk_init_common(sk);
3431 	sk->sk_send_head	=	NULL;
3432 
3433 	timer_setup(&sk->sk_timer, NULL, 0);
3434 
3435 	sk->sk_allocation	=	GFP_KERNEL;
3436 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3437 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3438 	sk->sk_state		=	TCP_CLOSE;
3439 	sk->sk_use_task_frag	=	true;
3440 	sk_set_socket(sk, sock);
3441 
3442 	sock_set_flag(sk, SOCK_ZAPPED);
3443 
3444 	if (sock) {
3445 		sk->sk_type	=	sock->type;
3446 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3447 		sock->sk	=	sk;
3448 	} else {
3449 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3450 	}
3451 	sk->sk_uid	=	uid;
3452 
3453 	rwlock_init(&sk->sk_callback_lock);
3454 	if (sk->sk_kern_sock)
3455 		lockdep_set_class_and_name(
3456 			&sk->sk_callback_lock,
3457 			af_kern_callback_keys + sk->sk_family,
3458 			af_family_kern_clock_key_strings[sk->sk_family]);
3459 	else
3460 		lockdep_set_class_and_name(
3461 			&sk->sk_callback_lock,
3462 			af_callback_keys + sk->sk_family,
3463 			af_family_clock_key_strings[sk->sk_family]);
3464 
3465 	sk->sk_state_change	=	sock_def_wakeup;
3466 	sk->sk_data_ready	=	sock_def_readable;
3467 	sk->sk_write_space	=	sock_def_write_space;
3468 	sk->sk_error_report	=	sock_def_error_report;
3469 	sk->sk_destruct		=	sock_def_destruct;
3470 
3471 	sk->sk_frag.page	=	NULL;
3472 	sk->sk_frag.offset	=	0;
3473 	sk->sk_peek_off		=	-1;
3474 
3475 	sk->sk_peer_pid 	=	NULL;
3476 	sk->sk_peer_cred	=	NULL;
3477 	spin_lock_init(&sk->sk_peer_lock);
3478 
3479 	sk->sk_write_pending	=	0;
3480 	sk->sk_rcvlowat		=	1;
3481 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3482 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3483 
3484 	sk->sk_stamp = SK_DEFAULT_STAMP;
3485 #if BITS_PER_LONG==32
3486 	seqlock_init(&sk->sk_stamp_seq);
3487 #endif
3488 	atomic_set(&sk->sk_zckey, 0);
3489 
3490 #ifdef CONFIG_NET_RX_BUSY_POLL
3491 	sk->sk_napi_id		=	0;
3492 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3493 #endif
3494 
3495 	sk->sk_max_pacing_rate = ~0UL;
3496 	sk->sk_pacing_rate = ~0UL;
3497 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3498 	sk->sk_incoming_cpu = -1;
3499 
3500 	sk_rx_queue_clear(sk);
3501 	/*
3502 	 * Before updating sk_refcnt, we must commit prior changes to memory
3503 	 * (Documentation/RCU/rculist_nulls.rst for details)
3504 	 */
3505 	smp_wmb();
3506 	refcount_set(&sk->sk_refcnt, 1);
3507 	atomic_set(&sk->sk_drops, 0);
3508 }
3509 EXPORT_SYMBOL(sock_init_data_uid);
3510 
3511 void sock_init_data(struct socket *sock, struct sock *sk)
3512 {
3513 	kuid_t uid = sock ?
3514 		SOCK_INODE(sock)->i_uid :
3515 		make_kuid(sock_net(sk)->user_ns, 0);
3516 
3517 	sock_init_data_uid(sock, sk, uid);
3518 }
3519 EXPORT_SYMBOL(sock_init_data);
3520 
3521 void lock_sock_nested(struct sock *sk, int subclass)
3522 {
3523 	/* The sk_lock has mutex_lock() semantics here. */
3524 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3525 
3526 	might_sleep();
3527 	spin_lock_bh(&sk->sk_lock.slock);
3528 	if (sock_owned_by_user_nocheck(sk))
3529 		__lock_sock(sk);
3530 	sk->sk_lock.owned = 1;
3531 	spin_unlock_bh(&sk->sk_lock.slock);
3532 }
3533 EXPORT_SYMBOL(lock_sock_nested);
3534 
3535 void release_sock(struct sock *sk)
3536 {
3537 	spin_lock_bh(&sk->sk_lock.slock);
3538 	if (sk->sk_backlog.tail)
3539 		__release_sock(sk);
3540 
3541 	if (sk->sk_prot->release_cb)
3542 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3543 				     tcp_release_cb, sk);
3544 
3545 	sock_release_ownership(sk);
3546 	if (waitqueue_active(&sk->sk_lock.wq))
3547 		wake_up(&sk->sk_lock.wq);
3548 	spin_unlock_bh(&sk->sk_lock.slock);
3549 }
3550 EXPORT_SYMBOL(release_sock);
3551 
3552 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3553 {
3554 	might_sleep();
3555 	spin_lock_bh(&sk->sk_lock.slock);
3556 
3557 	if (!sock_owned_by_user_nocheck(sk)) {
3558 		/*
3559 		 * Fast path return with bottom halves disabled and
3560 		 * sock::sk_lock.slock held.
3561 		 *
3562 		 * The 'mutex' is not contended and holding
3563 		 * sock::sk_lock.slock prevents all other lockers to
3564 		 * proceed so the corresponding unlock_sock_fast() can
3565 		 * avoid the slow path of release_sock() completely and
3566 		 * just release slock.
3567 		 *
3568 		 * From a semantical POV this is equivalent to 'acquiring'
3569 		 * the 'mutex', hence the corresponding lockdep
3570 		 * mutex_release() has to happen in the fast path of
3571 		 * unlock_sock_fast().
3572 		 */
3573 		return false;
3574 	}
3575 
3576 	__lock_sock(sk);
3577 	sk->sk_lock.owned = 1;
3578 	__acquire(&sk->sk_lock.slock);
3579 	spin_unlock_bh(&sk->sk_lock.slock);
3580 	return true;
3581 }
3582 EXPORT_SYMBOL(__lock_sock_fast);
3583 
3584 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3585 		   bool timeval, bool time32)
3586 {
3587 	struct sock *sk = sock->sk;
3588 	struct timespec64 ts;
3589 
3590 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3591 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3592 	if (ts.tv_sec == -1)
3593 		return -ENOENT;
3594 	if (ts.tv_sec == 0) {
3595 		ktime_t kt = ktime_get_real();
3596 		sock_write_timestamp(sk, kt);
3597 		ts = ktime_to_timespec64(kt);
3598 	}
3599 
3600 	if (timeval)
3601 		ts.tv_nsec /= 1000;
3602 
3603 #ifdef CONFIG_COMPAT_32BIT_TIME
3604 	if (time32)
3605 		return put_old_timespec32(&ts, userstamp);
3606 #endif
3607 #ifdef CONFIG_SPARC64
3608 	/* beware of padding in sparc64 timeval */
3609 	if (timeval && !in_compat_syscall()) {
3610 		struct __kernel_old_timeval __user tv = {
3611 			.tv_sec = ts.tv_sec,
3612 			.tv_usec = ts.tv_nsec,
3613 		};
3614 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3615 			return -EFAULT;
3616 		return 0;
3617 	}
3618 #endif
3619 	return put_timespec64(&ts, userstamp);
3620 }
3621 EXPORT_SYMBOL(sock_gettstamp);
3622 
3623 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3624 {
3625 	if (!sock_flag(sk, flag)) {
3626 		unsigned long previous_flags = sk->sk_flags;
3627 
3628 		sock_set_flag(sk, flag);
3629 		/*
3630 		 * we just set one of the two flags which require net
3631 		 * time stamping, but time stamping might have been on
3632 		 * already because of the other one
3633 		 */
3634 		if (sock_needs_netstamp(sk) &&
3635 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3636 			net_enable_timestamp();
3637 	}
3638 }
3639 
3640 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3641 		       int level, int type)
3642 {
3643 	struct sock_exterr_skb *serr;
3644 	struct sk_buff *skb;
3645 	int copied, err;
3646 
3647 	err = -EAGAIN;
3648 	skb = sock_dequeue_err_skb(sk);
3649 	if (skb == NULL)
3650 		goto out;
3651 
3652 	copied = skb->len;
3653 	if (copied > len) {
3654 		msg->msg_flags |= MSG_TRUNC;
3655 		copied = len;
3656 	}
3657 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3658 	if (err)
3659 		goto out_free_skb;
3660 
3661 	sock_recv_timestamp(msg, sk, skb);
3662 
3663 	serr = SKB_EXT_ERR(skb);
3664 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3665 
3666 	msg->msg_flags |= MSG_ERRQUEUE;
3667 	err = copied;
3668 
3669 out_free_skb:
3670 	kfree_skb(skb);
3671 out:
3672 	return err;
3673 }
3674 EXPORT_SYMBOL(sock_recv_errqueue);
3675 
3676 /*
3677  *	Get a socket option on an socket.
3678  *
3679  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3680  *	asynchronous errors should be reported by getsockopt. We assume
3681  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3682  */
3683 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3684 			   char __user *optval, int __user *optlen)
3685 {
3686 	struct sock *sk = sock->sk;
3687 
3688 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3689 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3690 }
3691 EXPORT_SYMBOL(sock_common_getsockopt);
3692 
3693 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3694 			int flags)
3695 {
3696 	struct sock *sk = sock->sk;
3697 	int addr_len = 0;
3698 	int err;
3699 
3700 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3701 	if (err >= 0)
3702 		msg->msg_namelen = addr_len;
3703 	return err;
3704 }
3705 EXPORT_SYMBOL(sock_common_recvmsg);
3706 
3707 /*
3708  *	Set socket options on an inet socket.
3709  */
3710 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3711 			   sockptr_t optval, unsigned int optlen)
3712 {
3713 	struct sock *sk = sock->sk;
3714 
3715 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3716 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3717 }
3718 EXPORT_SYMBOL(sock_common_setsockopt);
3719 
3720 void sk_common_release(struct sock *sk)
3721 {
3722 	if (sk->sk_prot->destroy)
3723 		sk->sk_prot->destroy(sk);
3724 
3725 	/*
3726 	 * Observation: when sk_common_release is called, processes have
3727 	 * no access to socket. But net still has.
3728 	 * Step one, detach it from networking:
3729 	 *
3730 	 * A. Remove from hash tables.
3731 	 */
3732 
3733 	sk->sk_prot->unhash(sk);
3734 
3735 	/*
3736 	 * In this point socket cannot receive new packets, but it is possible
3737 	 * that some packets are in flight because some CPU runs receiver and
3738 	 * did hash table lookup before we unhashed socket. They will achieve
3739 	 * receive queue and will be purged by socket destructor.
3740 	 *
3741 	 * Also we still have packets pending on receive queue and probably,
3742 	 * our own packets waiting in device queues. sock_destroy will drain
3743 	 * receive queue, but transmitted packets will delay socket destruction
3744 	 * until the last reference will be released.
3745 	 */
3746 
3747 	sock_orphan(sk);
3748 
3749 	xfrm_sk_free_policy(sk);
3750 
3751 	sock_put(sk);
3752 }
3753 EXPORT_SYMBOL(sk_common_release);
3754 
3755 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3756 {
3757 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3758 
3759 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3760 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3761 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3762 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3763 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3764 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3765 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3766 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3767 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3768 }
3769 
3770 #ifdef CONFIG_PROC_FS
/* One bit per prot->inuse_idx slot; a set bit means the slot is taken. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3772 
3773 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3774 {
3775 	int cpu, idx = prot->inuse_idx;
3776 	int res = 0;
3777 
3778 	for_each_possible_cpu(cpu)
3779 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3780 
3781 	return res >= 0 ? res : 0;
3782 }
3783 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3784 
3785 int sock_inuse_get(struct net *net)
3786 {
3787 	int cpu, res = 0;
3788 
3789 	for_each_possible_cpu(cpu)
3790 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3791 
3792 	return res;
3793 }
3794 
3795 EXPORT_SYMBOL_GPL(sock_inuse_get);
3796 
3797 static int __net_init sock_inuse_init_net(struct net *net)
3798 {
3799 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3800 	if (net->core.prot_inuse == NULL)
3801 		return -ENOMEM;
3802 	return 0;
3803 }
3804 
3805 static void __net_exit sock_inuse_exit_net(struct net *net)
3806 {
3807 	free_percpu(net->core.prot_inuse);
3808 }
3809 
3810 static struct pernet_operations net_inuse_ops = {
3811 	.init = sock_inuse_init_net,
3812 	.exit = sock_inuse_exit_net,
3813 };
3814 
3815 static __init int net_inuse_init(void)
3816 {
3817 	if (register_pernet_subsys(&net_inuse_ops))
3818 		panic("Cannot initialize net inuse counters");
3819 
3820 	return 0;
3821 }
3822 
3823 core_initcall(net_inuse_init);
3824 
3825 static int assign_proto_idx(struct proto *prot)
3826 {
3827 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3828 
3829 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3830 		pr_err("PROTO_INUSE_NR exhausted\n");
3831 		return -ENOSPC;
3832 	}
3833 
3834 	set_bit(prot->inuse_idx, proto_inuse_idx);
3835 	return 0;
3836 }
3837 
3838 static void release_proto_idx(struct proto *prot)
3839 {
3840 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3841 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3842 }
3843 #else
/* !CONFIG_PROC_FS: no in-use slots are tracked, so this always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}
3848 
/* !CONFIG_PROC_FS: nothing was assigned, so there is nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3852 
3853 #endif
3854 
3855 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3856 {
3857 	if (!twsk_prot)
3858 		return;
3859 	kfree(twsk_prot->twsk_slab_name);
3860 	twsk_prot->twsk_slab_name = NULL;
3861 	kmem_cache_destroy(twsk_prot->twsk_slab);
3862 	twsk_prot->twsk_slab = NULL;
3863 }
3864 
3865 static int tw_prot_init(const struct proto *prot)
3866 {
3867 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3868 
3869 	if (!twsk_prot)
3870 		return 0;
3871 
3872 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3873 					      prot->name);
3874 	if (!twsk_prot->twsk_slab_name)
3875 		return -ENOMEM;
3876 
3877 	twsk_prot->twsk_slab =
3878 		kmem_cache_create(twsk_prot->twsk_slab_name,
3879 				  twsk_prot->twsk_obj_size, 0,
3880 				  SLAB_ACCOUNT | prot->slab_flags,
3881 				  NULL);
3882 	if (!twsk_prot->twsk_slab) {
3883 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3884 			prot->name);
3885 		return -ENOMEM;
3886 	}
3887 
3888 	return 0;
3889 }
3890 
3891 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3892 {
3893 	if (!rsk_prot)
3894 		return;
3895 	kfree(rsk_prot->slab_name);
3896 	rsk_prot->slab_name = NULL;
3897 	kmem_cache_destroy(rsk_prot->slab);
3898 	rsk_prot->slab = NULL;
3899 }
3900 
3901 static int req_prot_init(const struct proto *prot)
3902 {
3903 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3904 
3905 	if (!rsk_prot)
3906 		return 0;
3907 
3908 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3909 					prot->name);
3910 	if (!rsk_prot->slab_name)
3911 		return -ENOMEM;
3912 
3913 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3914 					   rsk_prot->obj_size, 0,
3915 					   SLAB_ACCOUNT | prot->slab_flags,
3916 					   NULL);
3917 
3918 	if (!rsk_prot->slab) {
3919 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3920 			prot->name);
3921 		return -ENOMEM;
3922 	}
3923 	return 0;
3924 }
3925 
3926 int proto_register(struct proto *prot, int alloc_slab)
3927 {
3928 	int ret = -ENOBUFS;
3929 
3930 	if (prot->memory_allocated && !prot->sysctl_mem) {
3931 		pr_err("%s: missing sysctl_mem\n", prot->name);
3932 		return -EINVAL;
3933 	}
3934 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3935 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3936 		return -EINVAL;
3937 	}
3938 	if (alloc_slab) {
3939 		prot->slab = kmem_cache_create_usercopy(prot->name,
3940 					prot->obj_size, 0,
3941 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3942 					prot->slab_flags,
3943 					prot->useroffset, prot->usersize,
3944 					NULL);
3945 
3946 		if (prot->slab == NULL) {
3947 			pr_crit("%s: Can't create sock SLAB cache!\n",
3948 				prot->name);
3949 			goto out;
3950 		}
3951 
3952 		if (req_prot_init(prot))
3953 			goto out_free_request_sock_slab;
3954 
3955 		if (tw_prot_init(prot))
3956 			goto out_free_timewait_sock_slab;
3957 	}
3958 
3959 	mutex_lock(&proto_list_mutex);
3960 	ret = assign_proto_idx(prot);
3961 	if (ret) {
3962 		mutex_unlock(&proto_list_mutex);
3963 		goto out_free_timewait_sock_slab;
3964 	}
3965 	list_add(&prot->node, &proto_list);
3966 	mutex_unlock(&proto_list_mutex);
3967 	return ret;
3968 
3969 out_free_timewait_sock_slab:
3970 	if (alloc_slab)
3971 		tw_prot_cleanup(prot->twsk_prot);
3972 out_free_request_sock_slab:
3973 	if (alloc_slab) {
3974 		req_prot_cleanup(prot->rsk_prot);
3975 
3976 		kmem_cache_destroy(prot->slab);
3977 		prot->slab = NULL;
3978 	}
3979 out:
3980 	return ret;
3981 }
3982 EXPORT_SYMBOL(proto_register);
3983 
3984 void proto_unregister(struct proto *prot)
3985 {
3986 	mutex_lock(&proto_list_mutex);
3987 	release_proto_idx(prot);
3988 	list_del(&prot->node);
3989 	mutex_unlock(&proto_list_mutex);
3990 
3991 	kmem_cache_destroy(prot->slab);
3992 	prot->slab = NULL;
3993 
3994 	req_prot_cleanup(prot->rsk_prot);
3995 	tw_prot_cleanup(prot->twsk_prot);
3996 }
3997 EXPORT_SYMBOL(proto_unregister);
3998 
3999 int sock_load_diag_module(int family, int protocol)
4000 {
4001 	if (!protocol) {
4002 		if (!sock_is_registered(family))
4003 			return -ENOENT;
4004 
4005 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4006 				      NETLINK_SOCK_DIAG, family);
4007 	}
4008 
4009 #ifdef CONFIG_INET
4010 	if (family == AF_INET &&
4011 	    protocol != IPPROTO_RAW &&
4012 	    protocol < MAX_INET_PROTOS &&
4013 	    !rcu_access_pointer(inet_protos[protocol]))
4014 		return -ENOENT;
4015 #endif
4016 
4017 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4018 			      NETLINK_SOCK_DIAG, family, protocol);
4019 }
4020 EXPORT_SYMBOL(sock_load_diag_module);
4021 
4022 #ifdef CONFIG_PROC_FS
4023 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4024 	__acquires(proto_list_mutex)
4025 {
4026 	mutex_lock(&proto_list_mutex);
4027 	return seq_list_start_head(&proto_list, *pos);
4028 }
4029 
4030 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4031 {
4032 	return seq_list_next(v, &proto_list, pos);
4033 }
4034 
4035 static void proto_seq_stop(struct seq_file *seq, void *v)
4036 	__releases(proto_list_mutex)
4037 {
4038 	mutex_unlock(&proto_list_mutex);
4039 }
4040 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for /proc output. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
4045 static long sock_prot_memory_allocated(struct proto *proto)
4046 {
4047 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4048 }
4049 
4050 static const char *sock_prot_memory_pressure(struct proto *proto)
4051 {
4052 	return proto->memory_pressure != NULL ?
4053 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4054 }
4055 
4056 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4057 {
4058 
4059 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4060 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4061 		   proto->name,
4062 		   proto->obj_size,
4063 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4064 		   sock_prot_memory_allocated(proto),
4065 		   sock_prot_memory_pressure(proto),
4066 		   proto->max_header,
4067 		   proto->slab == NULL ? "no" : "yes",
4068 		   module_name(proto->owner),
4069 		   proto_method_implemented(proto->close),
4070 		   proto_method_implemented(proto->connect),
4071 		   proto_method_implemented(proto->disconnect),
4072 		   proto_method_implemented(proto->accept),
4073 		   proto_method_implemented(proto->ioctl),
4074 		   proto_method_implemented(proto->init),
4075 		   proto_method_implemented(proto->destroy),
4076 		   proto_method_implemented(proto->shutdown),
4077 		   proto_method_implemented(proto->setsockopt),
4078 		   proto_method_implemented(proto->getsockopt),
4079 		   proto_method_implemented(proto->sendmsg),
4080 		   proto_method_implemented(proto->recvmsg),
4081 		   proto_method_implemented(proto->bind),
4082 		   proto_method_implemented(proto->backlog_rcv),
4083 		   proto_method_implemented(proto->hash),
4084 		   proto_method_implemented(proto->unhash),
4085 		   proto_method_implemented(proto->get_port),
4086 		   proto_method_implemented(proto->enter_memory_pressure));
4087 }
4088 
4089 static int proto_seq_show(struct seq_file *seq, void *v)
4090 {
4091 	if (v == &proto_list)
4092 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4093 			   "protocol",
4094 			   "size",
4095 			   "sockets",
4096 			   "memory",
4097 			   "press",
4098 			   "maxhdr",
4099 			   "slab",
4100 			   "module",
4101 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4102 	else
4103 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4104 	return 0;
4105 }
4106 
4107 static const struct seq_operations proto_seq_ops = {
4108 	.start  = proto_seq_start,
4109 	.next   = proto_seq_next,
4110 	.stop   = proto_seq_stop,
4111 	.show   = proto_seq_show,
4112 };
4113 
4114 static __net_init int proto_init_net(struct net *net)
4115 {
4116 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4117 			sizeof(struct seq_net_private)))
4118 		return -ENOMEM;
4119 
4120 	return 0;
4121 }
4122 
4123 static __net_exit void proto_exit_net(struct net *net)
4124 {
4125 	remove_proc_entry("protocols", net->proc_net);
4126 }
4127 
4128 
4129 static __net_initdata struct pernet_operations proto_net_ops = {
4130 	.init = proto_init_net,
4131 	.exit = proto_exit_net,
4132 };
4133 
4134 static int __init proto_init(void)
4135 {
4136 	return register_pernet_subsys(&proto_net_ops);
4137 }
4138 
4139 subsys_initcall(proto_init);
4140 
4141 #endif /* PROC_FS */
4142 
4143 #ifdef CONFIG_NET_RX_BUSY_POLL
4144 bool sk_busy_loop_end(void *p, unsigned long start_time)
4145 {
4146 	struct sock *sk = p;
4147 
4148 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4149 		return true;
4150 
4151 	if (sk_is_udp(sk) &&
4152 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4153 		return true;
4154 
4155 	return sk_busy_loop_timeout(sk, start_time);
4156 }
4157 EXPORT_SYMBOL(sk_busy_loop_end);
4158 #endif /* CONFIG_NET_RX_BUSY_POLL */
4159 
4160 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4161 {
4162 	if (!sk->sk_prot->bind_add)
4163 		return -EOPNOTSUPP;
4164 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4165 }
4166 EXPORT_SYMBOL(sock_bind_add);
4167 
4168 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4169 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4170 		     void __user *arg, void *karg, size_t size)
4171 {
4172 	int ret;
4173 
4174 	if (copy_from_user(karg, arg, size))
4175 		return -EFAULT;
4176 
4177 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4178 	if (ret)
4179 		return ret;
4180 
4181 	if (copy_to_user(arg, karg, size))
4182 		return -EFAULT;
4183 
4184 	return 0;
4185 }
4186 EXPORT_SYMBOL(sock_ioctl_inout);
4187 
4188 /* This is the most common ioctl prep function, where the result (4 bytes) is
4189  * copied back to userspace if the ioctl() returns successfully. No input is
4190  * copied from userspace as input argument.
4191  */
4192 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4193 {
4194 	int ret, karg = 0;
4195 
4196 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4197 	if (ret)
4198 		return ret;
4199 
4200 	return put_user(karg, (int __user *)arg);
4201 }
4202 
4203 /* A wrapper around sock ioctls, which copies the data from userspace
4204  * (depending on the protocol/ioctl), and copies back the result to userspace.
4205  * The main motivation for this function is to pass kernel memory to the
4206  * protocol ioctl callbacks, instead of userspace memory.
4207  */
4208 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4209 {
4210 	int rc = 1;
4211 
4212 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4213 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4214 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4215 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4216 	else if (sk_is_phonet(sk))
4217 		rc = phonet_sk_ioctl(sk, cmd, arg);
4218 
4219 	/* If ioctl was processed, returns its value */
4220 	if (rc <= 0)
4221 		return rc;
4222 
4223 	/* Otherwise call the default handler */
4224 	return sock_ioctl_out(sk, cmd, arg);
4225 }
4226 EXPORT_SYMBOL(sk_ioctl);
4227