xref: /linux/net/core/sock.c (revision b6459415b384cb829f0b2a4268f211c789f6cf0b)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had when the socket was
154  * created and the current process has the capability @cap in the user
155  * namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had when the socket was
171  * created and the current process has the capability @cap in all user
172  * namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had when the socket was created
186  * and the current process has the capability @cap over the network namespace
187  * the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
194 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 *
 * All four arrays hold AF_MAX lockdep class keys, i.e. one slot per
 * address family number.  The "kern" variants are the ones meant for
 * internal (kernel) sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the prefix @x concatenated with an address family name.  The
 * list is positional: it has one entry per address family plus a final
 * "AF_MAX" entry, matching the AF_MAX+1 sized tables it initializes.
 * Slots without a symbolic name here are spelled as their number
 * ("27", "28").
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"
229 
/*
 * Lock class name tables.  Each table is _sock_locks() expanded with a
 * different prefix, so entry N is "<prefix><name of address family N>".
 * The "k-" prefixed tables are the counterparts for kernel sockets.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
258 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 *
 * One array of AF_MAX keys per lock kind; judging by the names these
 * pair up with the "clock-", "rlock-", "wlock-" and "elock-" name
 * tables above (NOTE(review): confirm against the users of these
 * arrays, which are outside this part of the file).
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];
268 
/* Run time adjustable parameters. */

/* Cap applied to the value passed to SO_SNDBUF in sock_setsockopt(). */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
/* Cap applied to the value passed to SO_RCVBUF in sock_setsockopt(). */
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
/* Default send/receive buffer sizes; initialised to the same constants
 * as the maxima above.
 */
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

/* Enabled (1) by default.  NOTE(review): the consumer of this knob is
 * not in this part of the file; the name suggests it controls whether
 * timestamps may be returned together with packet data - confirm.
 */
int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key, off by default; incremented by sk_set_memalloc() and
 * decremented by sk_clear_memalloc().
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
285 
286 /**
287  * sk_set_memalloc - sets %SOCK_MEMALLOC
288  * @sk: socket to set it on
289  *
290  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291  * It's the responsibility of the admin to adjust min_free_kbytes
292  * to meet the requirements
293  */
294 void sk_set_memalloc(struct sock *sk)
295 {
296 	sock_set_flag(sk, SOCK_MEMALLOC);
297 	sk->sk_allocation |= __GFP_MEMALLOC;
298 	static_branch_inc(&memalloc_socks_key);
299 }
300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
301 
302 void sk_clear_memalloc(struct sock *sk)
303 {
304 	sock_reset_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation &= ~__GFP_MEMALLOC;
306 	static_branch_dec(&memalloc_socks_key);
307 
308 	/*
309 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 	 * it has rmem allocations due to the last swapfile being deactivated
312 	 * but there is a risk that the socket is unusable due to exceeding
313 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 	 */
315 	sk_mem_reclaim(sk);
316 }
317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
318 
319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320 {
321 	int ret;
322 	unsigned int noreclaim_flag;
323 
324 	/* these should have been dropped before queueing */
325 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326 
327 	noreclaim_flag = memalloc_noreclaim_save();
328 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
329 				 tcp_v6_do_rcv,
330 				 tcp_v4_do_rcv,
331 				 sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 void sk_error_report(struct sock *sk)
339 {
340 	sk->sk_error_report(sk);
341 
342 	switch (sk->sk_family) {
343 	case AF_INET:
344 		fallthrough;
345 	case AF_INET6:
346 		trace_inet_sk_error_report(sk);
347 		break;
348 	default:
349 		break;
350 	}
351 }
352 EXPORT_SYMBOL(sk_error_report);
353 
354 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
355 {
356 	struct __kernel_sock_timeval tv;
357 
358 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
359 		tv.tv_sec = 0;
360 		tv.tv_usec = 0;
361 	} else {
362 		tv.tv_sec = timeo / HZ;
363 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
364 	}
365 
366 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
367 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
368 		*(struct old_timeval32 *)optval = tv32;
369 		return sizeof(tv32);
370 	}
371 
372 	if (old_timeval) {
373 		struct __kernel_old_timeval old_tv;
374 		old_tv.tv_sec = tv.tv_sec;
375 		old_tv.tv_usec = tv.tv_usec;
376 		*(struct __kernel_old_timeval *)optval = old_tv;
377 		return sizeof(old_tv);
378 	}
379 
380 	*(struct __kernel_sock_timeval *)optval = tv;
381 	return sizeof(tv);
382 }
383 EXPORT_SYMBOL(sock_get_timeout);
384 
385 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
386 			   sockptr_t optval, int optlen, bool old_timeval)
387 {
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv->tv_sec = tv32.tv_sec;
397 		tv->tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv->tv_sec = old_tv.tv_sec;
406 		tv->tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(*tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
411 			return -EFAULT;
412 	}
413 
414 	return 0;
415 }
416 EXPORT_SYMBOL(sock_copy_user_timeval);
417 
418 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
419 			    bool old_timeval)
420 {
421 	struct __kernel_sock_timeval tv;
422 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
423 
424 	if (err)
425 		return err;
426 
427 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
428 		return -EDOM;
429 
430 	if (tv.tv_sec < 0) {
431 		static int warned __read_mostly;
432 
433 		*timeo_p = 0;
434 		if (warned < 10 && net_ratelimit()) {
435 			warned++;
436 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
437 				__func__, current->comm, task_pid_nr(current));
438 		}
439 		return 0;
440 	}
441 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
442 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
443 		return 0;
444 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
445 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
446 	return 0;
447 }
448 
449 static bool sock_needs_netstamp(const struct sock *sk)
450 {
451 	switch (sk->sk_family) {
452 	case AF_UNSPEC:
453 	case AF_UNIX:
454 		return false;
455 	default:
456 		return true;
457 	}
458 }
459 
460 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
461 {
462 	if (sk->sk_flags & flags) {
463 		sk->sk_flags &= ~flags;
464 		if (sock_needs_netstamp(sk) &&
465 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
466 			net_disable_timestamp();
467 	}
468 }
469 
470 
471 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
472 {
473 	unsigned long flags;
474 	struct sk_buff_head *list = &sk->sk_receive_queue;
475 
476 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
477 		atomic_inc(&sk->sk_drops);
478 		trace_sock_rcvqueue_full(sk, skb);
479 		return -ENOMEM;
480 	}
481 
482 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
483 		atomic_inc(&sk->sk_drops);
484 		return -ENOBUFS;
485 	}
486 
487 	skb->dev = NULL;
488 	skb_set_owner_r(skb, sk);
489 
490 	/* we escape from rcu protected region, make sure we dont leak
491 	 * a norefcounted dst
492 	 */
493 	skb_dst_force(skb);
494 
495 	spin_lock_irqsave(&list->lock, flags);
496 	sock_skb_set_dropcount(sk, skb);
497 	__skb_queue_tail(list, skb);
498 	spin_unlock_irqrestore(&list->lock, flags);
499 
500 	if (!sock_flag(sk, SOCK_DEAD))
501 		sk->sk_data_ready(sk);
502 	return 0;
503 }
504 EXPORT_SYMBOL(__sock_queue_rcv_skb);
505 
/*
 * Run the socket filter on @skb and, if it passes, queue it with
 * __sock_queue_rcv_skb().  A non-zero result from sk_filter() is
 * returned unchanged.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);

	return err ? err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
517 
518 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
519 		     const int nested, unsigned int trim_cap, bool refcounted)
520 {
521 	int rc = NET_RX_SUCCESS;
522 
523 	if (sk_filter_trim_cap(sk, skb, trim_cap))
524 		goto discard_and_relse;
525 
526 	skb->dev = NULL;
527 
528 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
529 		atomic_inc(&sk->sk_drops);
530 		goto discard_and_relse;
531 	}
532 	if (nested)
533 		bh_lock_sock_nested(sk);
534 	else
535 		bh_lock_sock(sk);
536 	if (!sock_owned_by_user(sk)) {
537 		/*
538 		 * trylock + unlock semantics:
539 		 */
540 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
541 
542 		rc = sk_backlog_rcv(sk, skb);
543 
544 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
545 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
546 		bh_unlock_sock(sk);
547 		atomic_inc(&sk->sk_drops);
548 		goto discard_and_relse;
549 	}
550 
551 	bh_unlock_sock(sk);
552 out:
553 	if (refcounted)
554 		sock_put(sk);
555 	return rc;
556 discard_and_relse:
557 	kfree_skb(skb);
558 	goto out;
559 }
560 EXPORT_SYMBOL(__sk_receive_skb);
561 
562 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
563 							  u32));
564 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
565 							   u32));
566 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
567 {
568 	struct dst_entry *dst = __sk_dst_get(sk);
569 
570 	if (dst && dst->obsolete &&
571 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
572 			       dst, cookie) == NULL) {
573 		sk_tx_queue_clear(sk);
574 		sk->sk_dst_pending_confirm = 0;
575 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
576 		dst_release(dst);
577 		return NULL;
578 	}
579 
580 	return dst;
581 }
582 EXPORT_SYMBOL(__sk_dst_check);
583 
584 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
585 {
586 	struct dst_entry *dst = sk_dst_get(sk);
587 
588 	if (dst && dst->obsolete &&
589 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
590 			       dst, cookie) == NULL) {
591 		sk_dst_reset(sk);
592 		dst_release(dst);
593 		return NULL;
594 	}
595 
596 	return dst;
597 }
598 EXPORT_SYMBOL(sk_dst_check);
599 
600 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
601 {
602 	int ret = -ENOPROTOOPT;
603 #ifdef CONFIG_NETDEVICES
604 	struct net *net = sock_net(sk);
605 
606 	/* Sorry... */
607 	ret = -EPERM;
608 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
609 		goto out;
610 
611 	ret = -EINVAL;
612 	if (ifindex < 0)
613 		goto out;
614 
615 	sk->sk_bound_dev_if = ifindex;
616 	if (sk->sk_prot->rehash)
617 		sk->sk_prot->rehash(sk);
618 	sk_dst_reset(sk);
619 
620 	ret = 0;
621 
622 out:
623 #endif
624 
625 	return ret;
626 }
627 
628 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
629 {
630 	int ret;
631 
632 	if (lock_sk)
633 		lock_sock(sk);
634 	ret = sock_bindtoindex_locked(sk, ifindex);
635 	if (lock_sk)
636 		release_sock(sk);
637 
638 	return ret;
639 }
640 EXPORT_SYMBOL(sock_bindtoindex);
641 
642 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
643 {
644 	int ret = -ENOPROTOOPT;
645 #ifdef CONFIG_NETDEVICES
646 	struct net *net = sock_net(sk);
647 	char devname[IFNAMSIZ];
648 	int index;
649 
650 	ret = -EINVAL;
651 	if (optlen < 0)
652 		goto out;
653 
654 	/* Bind this socket to a particular device like "eth0",
655 	 * as specified in the passed interface name. If the
656 	 * name is "" or the option length is zero the socket
657 	 * is not bound.
658 	 */
659 	if (optlen > IFNAMSIZ - 1)
660 		optlen = IFNAMSIZ - 1;
661 	memset(devname, 0, sizeof(devname));
662 
663 	ret = -EFAULT;
664 	if (copy_from_sockptr(devname, optval, optlen))
665 		goto out;
666 
667 	index = 0;
668 	if (devname[0] != '\0') {
669 		struct net_device *dev;
670 
671 		rcu_read_lock();
672 		dev = dev_get_by_name_rcu(net, devname);
673 		if (dev)
674 			index = dev->ifindex;
675 		rcu_read_unlock();
676 		ret = -ENODEV;
677 		if (!dev)
678 			goto out;
679 	}
680 
681 	return sock_bindtoindex(sk, index, true);
682 out:
683 #endif
684 
685 	return ret;
686 }
687 
688 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
689 				int __user *optlen, int len)
690 {
691 	int ret = -ENOPROTOOPT;
692 #ifdef CONFIG_NETDEVICES
693 	struct net *net = sock_net(sk);
694 	char devname[IFNAMSIZ];
695 
696 	if (sk->sk_bound_dev_if == 0) {
697 		len = 0;
698 		goto zero;
699 	}
700 
701 	ret = -EINVAL;
702 	if (len < IFNAMSIZ)
703 		goto out;
704 
705 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
706 	if (ret)
707 		goto out;
708 
709 	len = strlen(devname) + 1;
710 
711 	ret = -EFAULT;
712 	if (copy_to_user(optval, devname, len))
713 		goto out;
714 
715 zero:
716 	ret = -EFAULT;
717 	if (put_user(len, optlen))
718 		goto out;
719 
720 	ret = 0;
721 
722 out:
723 #endif
724 
725 	return ret;
726 }
727 
728 bool sk_mc_loop(struct sock *sk)
729 {
730 	if (dev_recursion_level())
731 		return false;
732 	if (!sk)
733 		return true;
734 	switch (sk->sk_family) {
735 	case AF_INET:
736 		return inet_sk(sk)->mc_loop;
737 #if IS_ENABLED(CONFIG_IPV6)
738 	case AF_INET6:
739 		return inet6_sk(sk)->mc_loop;
740 #endif
741 	}
742 	WARN_ON_ONCE(1);
743 	return true;
744 }
745 EXPORT_SYMBOL(sk_mc_loop);
746 
747 void sock_set_reuseaddr(struct sock *sk)
748 {
749 	lock_sock(sk);
750 	sk->sk_reuse = SK_CAN_REUSE;
751 	release_sock(sk);
752 }
753 EXPORT_SYMBOL(sock_set_reuseaddr);
754 
755 void sock_set_reuseport(struct sock *sk)
756 {
757 	lock_sock(sk);
758 	sk->sk_reuseport = true;
759 	release_sock(sk);
760 }
761 EXPORT_SYMBOL(sock_set_reuseport);
762 
763 void sock_no_linger(struct sock *sk)
764 {
765 	lock_sock(sk);
766 	sk->sk_lingertime = 0;
767 	sock_set_flag(sk, SOCK_LINGER);
768 	release_sock(sk);
769 }
770 EXPORT_SYMBOL(sock_no_linger);
771 
772 void sock_set_priority(struct sock *sk, u32 priority)
773 {
774 	lock_sock(sk);
775 	sk->sk_priority = priority;
776 	release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_priority);
779 
780 void sock_set_sndtimeo(struct sock *sk, s64 secs)
781 {
782 	lock_sock(sk);
783 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
784 		sk->sk_sndtimeo = secs * HZ;
785 	else
786 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
787 	release_sock(sk);
788 }
789 EXPORT_SYMBOL(sock_set_sndtimeo);
790 
791 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
792 {
793 	if (val)  {
794 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
795 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
796 		sock_set_flag(sk, SOCK_RCVTSTAMP);
797 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
798 	} else {
799 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
800 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
801 	}
802 }
803 
804 void sock_enable_timestamps(struct sock *sk)
805 {
806 	lock_sock(sk);
807 	__sock_set_timestamps(sk, true, false, true);
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_enable_timestamps);
811 
812 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
813 {
814 	switch (optname) {
815 	case SO_TIMESTAMP_OLD:
816 		__sock_set_timestamps(sk, valbool, false, false);
817 		break;
818 	case SO_TIMESTAMP_NEW:
819 		__sock_set_timestamps(sk, valbool, true, false);
820 		break;
821 	case SO_TIMESTAMPNS_OLD:
822 		__sock_set_timestamps(sk, valbool, false, true);
823 		break;
824 	case SO_TIMESTAMPNS_NEW:
825 		__sock_set_timestamps(sk, valbool, true, true);
826 		break;
827 	}
828 }
829 
830 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
831 {
832 	struct net *net = sock_net(sk);
833 	struct net_device *dev = NULL;
834 	bool match = false;
835 	int *vclock_index;
836 	int i, num;
837 
838 	if (sk->sk_bound_dev_if)
839 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
840 
841 	if (!dev) {
842 		pr_err("%s: sock not bind to device\n", __func__);
843 		return -EOPNOTSUPP;
844 	}
845 
846 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
847 	for (i = 0; i < num; i++) {
848 		if (*(vclock_index + i) == phc_index) {
849 			match = true;
850 			break;
851 		}
852 	}
853 
854 	if (num > 0)
855 		kfree(vclock_index);
856 
857 	if (!match)
858 		return -EINVAL;
859 
860 	sk->sk_bind_phc = phc_index;
861 
862 	return 0;
863 }
864 
865 int sock_set_timestamping(struct sock *sk, int optname,
866 			  struct so_timestamping timestamping)
867 {
868 	int val = timestamping.flags;
869 	int ret;
870 
871 	if (val & ~SOF_TIMESTAMPING_MASK)
872 		return -EINVAL;
873 
874 	if (val & SOF_TIMESTAMPING_OPT_ID &&
875 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
876 		if (sk_is_tcp(sk)) {
877 			if ((1 << sk->sk_state) &
878 			    (TCPF_CLOSE | TCPF_LISTEN))
879 				return -EINVAL;
880 			sk->sk_tskey = tcp_sk(sk)->snd_una;
881 		} else {
882 			sk->sk_tskey = 0;
883 		}
884 	}
885 
886 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
887 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
888 		return -EINVAL;
889 
890 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
891 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
892 		if (ret)
893 			return ret;
894 	}
895 
896 	sk->sk_tsflags = val;
897 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
898 
899 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
900 		sock_enable_timestamp(sk,
901 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
902 	else
903 		sock_disable_timestamp(sk,
904 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
905 	return 0;
906 }
907 
908 void sock_set_keepalive(struct sock *sk)
909 {
910 	lock_sock(sk);
911 	if (sk->sk_prot->keepalive)
912 		sk->sk_prot->keepalive(sk, true);
913 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
914 	release_sock(sk);
915 }
916 EXPORT_SYMBOL(sock_set_keepalive);
917 
918 static void __sock_set_rcvbuf(struct sock *sk, int val)
919 {
920 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
921 	 * as a negative value.
922 	 */
923 	val = min_t(int, val, INT_MAX / 2);
924 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
925 
926 	/* We double it on the way in to account for "struct sk_buff" etc.
927 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
928 	 * will allow that much actual data to be received on that socket.
929 	 *
930 	 * Applications are unaware that "struct sk_buff" and other overheads
931 	 * allocate from the receive buffer during socket buffer allocation.
932 	 *
933 	 * And after considering the possible alternatives, returning the value
934 	 * we actually used in getsockopt is the most desirable behavior.
935 	 */
936 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
937 }
938 
/*
 * In-kernel helper: set the receive buffer size of @sk via
 * __sock_set_rcvbuf(), under the socket lock.  @val is not capped by
 * sysctl_rmem_max here.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
946 
947 static void __sock_set_mark(struct sock *sk, u32 val)
948 {
949 	if (val != sk->sk_mark) {
950 		sk->sk_mark = val;
951 		sk_dst_reset(sk);
952 	}
953 }
954 
955 void sock_set_mark(struct sock *sk, u32 val)
956 {
957 	lock_sock(sk);
958 	__sock_set_mark(sk, val);
959 	release_sock(sk);
960 }
961 EXPORT_SYMBOL(sock_set_mark);
962 
963 static void sock_release_reserved_memory(struct sock *sk, int bytes)
964 {
965 	/* Round down bytes to multiple of pages */
966 	bytes &= ~(SK_MEM_QUANTUM - 1);
967 
968 	WARN_ON(bytes > sk->sk_reserved_mem);
969 	sk->sk_reserved_mem -= bytes;
970 	sk_mem_reclaim(sk);
971 }
972 
973 static int sock_reserve_memory(struct sock *sk, int bytes)
974 {
975 	long allocated;
976 	bool charged;
977 	int pages;
978 
979 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
980 		return -EOPNOTSUPP;
981 
982 	if (!bytes)
983 		return 0;
984 
985 	pages = sk_mem_pages(bytes);
986 
987 	/* pre-charge to memcg */
988 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
989 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
990 	if (!charged)
991 		return -ENOMEM;
992 
993 	/* pre-charge to forward_alloc */
994 	allocated = sk_memory_allocated_add(sk, pages);
995 	/* If the system goes into memory pressure with this
996 	 * precharge, give up and return error.
997 	 */
998 	if (allocated > sk_prot_mem_limits(sk, 1)) {
999 		sk_memory_allocated_sub(sk, pages);
1000 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1001 		return -ENOMEM;
1002 	}
1003 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1004 
1005 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1006 
1007 	return 0;
1008 }
1009 
1010 /*
1011  *	This is meant for all protocols to use and covers goings on
1012  *	at the socket level. Everything here is generic.
1013  */
1014 
1015 int sock_setsockopt(struct socket *sock, int level, int optname,
1016 		    sockptr_t optval, unsigned int optlen)
1017 {
1018 	struct so_timestamping timestamping;
1019 	struct sock_txtime sk_txtime;
1020 	struct sock *sk = sock->sk;
1021 	int val;
1022 	int valbool;
1023 	struct linger ling;
1024 	int ret = 0;
1025 
1026 	/*
1027 	 *	Options without arguments
1028 	 */
1029 
1030 	if (optname == SO_BINDTODEVICE)
1031 		return sock_setbindtodevice(sk, optval, optlen);
1032 
1033 	if (optlen < sizeof(int))
1034 		return -EINVAL;
1035 
1036 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1037 		return -EFAULT;
1038 
1039 	valbool = val ? 1 : 0;
1040 
1041 	lock_sock(sk);
1042 
1043 	switch (optname) {
1044 	case SO_DEBUG:
1045 		if (val && !capable(CAP_NET_ADMIN))
1046 			ret = -EACCES;
1047 		else
1048 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1049 		break;
1050 	case SO_REUSEADDR:
1051 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1052 		break;
1053 	case SO_REUSEPORT:
1054 		sk->sk_reuseport = valbool;
1055 		break;
1056 	case SO_TYPE:
1057 	case SO_PROTOCOL:
1058 	case SO_DOMAIN:
1059 	case SO_ERROR:
1060 		ret = -ENOPROTOOPT;
1061 		break;
1062 	case SO_DONTROUTE:
1063 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1064 		sk_dst_reset(sk);
1065 		break;
1066 	case SO_BROADCAST:
1067 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1068 		break;
1069 	case SO_SNDBUF:
1070 		/* Don't error on this BSD doesn't and if you think
1071 		 * about it this is right. Otherwise apps have to
1072 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1073 		 * are treated in BSD as hints
1074 		 */
1075 		val = min_t(u32, val, sysctl_wmem_max);
1076 set_sndbuf:
1077 		/* Ensure val * 2 fits into an int, to prevent max_t()
1078 		 * from treating it as a negative value.
1079 		 */
1080 		val = min_t(int, val, INT_MAX / 2);
1081 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1082 		WRITE_ONCE(sk->sk_sndbuf,
1083 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1084 		/* Wake up sending tasks if we upped the value. */
1085 		sk->sk_write_space(sk);
1086 		break;
1087 
1088 	case SO_SNDBUFFORCE:
1089 		if (!capable(CAP_NET_ADMIN)) {
1090 			ret = -EPERM;
1091 			break;
1092 		}
1093 
1094 		/* No negative values (to prevent underflow, as val will be
1095 		 * multiplied by 2).
1096 		 */
1097 		if (val < 0)
1098 			val = 0;
1099 		goto set_sndbuf;
1100 
1101 	case SO_RCVBUF:
1102 		/* Don't error on this BSD doesn't and if you think
1103 		 * about it this is right. Otherwise apps have to
1104 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1105 		 * are treated in BSD as hints
1106 		 */
1107 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1108 		break;
1109 
1110 	case SO_RCVBUFFORCE:
1111 		if (!capable(CAP_NET_ADMIN)) {
1112 			ret = -EPERM;
1113 			break;
1114 		}
1115 
1116 		/* No negative values (to prevent underflow, as val will be
1117 		 * multiplied by 2).
1118 		 */
1119 		__sock_set_rcvbuf(sk, max(val, 0));
1120 		break;
1121 
1122 	case SO_KEEPALIVE:
1123 		if (sk->sk_prot->keepalive)
1124 			sk->sk_prot->keepalive(sk, valbool);
1125 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1126 		break;
1127 
1128 	case SO_OOBINLINE:
1129 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1130 		break;
1131 
1132 	case SO_NO_CHECK:
1133 		sk->sk_no_check_tx = valbool;
1134 		break;
1135 
1136 	case SO_PRIORITY:
1137 		if ((val >= 0 && val <= 6) ||
1138 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1139 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1140 			sk->sk_priority = val;
1141 		else
1142 			ret = -EPERM;
1143 		break;
1144 
1145 	case SO_LINGER:
1146 		if (optlen < sizeof(ling)) {
1147 			ret = -EINVAL;	/* 1003.1g */
1148 			break;
1149 		}
1150 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1151 			ret = -EFAULT;
1152 			break;
1153 		}
1154 		if (!ling.l_onoff)
1155 			sock_reset_flag(sk, SOCK_LINGER);
1156 		else {
1157 #if (BITS_PER_LONG == 32)
1158 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1159 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1160 			else
1161 #endif
1162 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1163 			sock_set_flag(sk, SOCK_LINGER);
1164 		}
1165 		break;
1166 
1167 	case SO_BSDCOMPAT:
1168 		break;
1169 
1170 	case SO_PASSCRED:
1171 		if (valbool)
1172 			set_bit(SOCK_PASSCRED, &sock->flags);
1173 		else
1174 			clear_bit(SOCK_PASSCRED, &sock->flags);
1175 		break;
1176 
1177 	case SO_TIMESTAMP_OLD:
1178 	case SO_TIMESTAMP_NEW:
1179 	case SO_TIMESTAMPNS_OLD:
1180 	case SO_TIMESTAMPNS_NEW:
1181 		sock_set_timestamp(sk, optname, valbool);
1182 		break;
1183 
1184 	case SO_TIMESTAMPING_NEW:
1185 	case SO_TIMESTAMPING_OLD:
1186 		if (optlen == sizeof(timestamping)) {
1187 			if (copy_from_sockptr(&timestamping, optval,
1188 					      sizeof(timestamping))) {
1189 				ret = -EFAULT;
1190 				break;
1191 			}
1192 		} else {
1193 			memset(&timestamping, 0, sizeof(timestamping));
1194 			timestamping.flags = val;
1195 		}
1196 		ret = sock_set_timestamping(sk, optname, timestamping);
1197 		break;
1198 
1199 	case SO_RCVLOWAT:
1200 		if (val < 0)
1201 			val = INT_MAX;
1202 		if (sock->ops->set_rcvlowat)
1203 			ret = sock->ops->set_rcvlowat(sk, val);
1204 		else
1205 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1206 		break;
1207 
1208 	case SO_RCVTIMEO_OLD:
1209 	case SO_RCVTIMEO_NEW:
1210 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1211 				       optlen, optname == SO_RCVTIMEO_OLD);
1212 		break;
1213 
1214 	case SO_SNDTIMEO_OLD:
1215 	case SO_SNDTIMEO_NEW:
1216 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1217 				       optlen, optname == SO_SNDTIMEO_OLD);
1218 		break;
1219 
1220 	case SO_ATTACH_FILTER: {
1221 		struct sock_fprog fprog;
1222 
1223 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1224 		if (!ret)
1225 			ret = sk_attach_filter(&fprog, sk);
1226 		break;
1227 	}
1228 	case SO_ATTACH_BPF:
1229 		ret = -EINVAL;
1230 		if (optlen == sizeof(u32)) {
1231 			u32 ufd;
1232 
1233 			ret = -EFAULT;
1234 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1235 				break;
1236 
1237 			ret = sk_attach_bpf(ufd, sk);
1238 		}
1239 		break;
1240 
1241 	case SO_ATTACH_REUSEPORT_CBPF: {
1242 		struct sock_fprog fprog;
1243 
1244 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1245 		if (!ret)
1246 			ret = sk_reuseport_attach_filter(&fprog, sk);
1247 		break;
1248 	}
1249 	case SO_ATTACH_REUSEPORT_EBPF:
1250 		ret = -EINVAL;
1251 		if (optlen == sizeof(u32)) {
1252 			u32 ufd;
1253 
1254 			ret = -EFAULT;
1255 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1256 				break;
1257 
1258 			ret = sk_reuseport_attach_bpf(ufd, sk);
1259 		}
1260 		break;
1261 
1262 	case SO_DETACH_REUSEPORT_BPF:
1263 		ret = reuseport_detach_prog(sk);
1264 		break;
1265 
1266 	case SO_DETACH_FILTER:
1267 		ret = sk_detach_filter(sk);
1268 		break;
1269 
1270 	case SO_LOCK_FILTER:
1271 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1272 			ret = -EPERM;
1273 		else
1274 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1275 		break;
1276 
1277 	case SO_PASSSEC:
1278 		if (valbool)
1279 			set_bit(SOCK_PASSSEC, &sock->flags);
1280 		else
1281 			clear_bit(SOCK_PASSSEC, &sock->flags);
1282 		break;
1283 	case SO_MARK:
1284 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1285 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1286 			ret = -EPERM;
1287 			break;
1288 		}
1289 
1290 		__sock_set_mark(sk, val);
1291 		break;
1292 
1293 	case SO_RXQ_OVFL:
1294 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1295 		break;
1296 
1297 	case SO_WIFI_STATUS:
1298 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1299 		break;
1300 
1301 	case SO_PEEK_OFF:
1302 		if (sock->ops->set_peek_off)
1303 			ret = sock->ops->set_peek_off(sk, val);
1304 		else
1305 			ret = -EOPNOTSUPP;
1306 		break;
1307 
1308 	case SO_NOFCS:
1309 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1310 		break;
1311 
1312 	case SO_SELECT_ERR_QUEUE:
1313 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1314 		break;
1315 
1316 #ifdef CONFIG_NET_RX_BUSY_POLL
1317 	case SO_BUSY_POLL:
1318 		/* allow unprivileged users to decrease the value */
1319 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1320 			ret = -EPERM;
1321 		else {
1322 			if (val < 0)
1323 				ret = -EINVAL;
1324 			else
1325 				WRITE_ONCE(sk->sk_ll_usec, val);
1326 		}
1327 		break;
1328 	case SO_PREFER_BUSY_POLL:
1329 		if (valbool && !capable(CAP_NET_ADMIN))
1330 			ret = -EPERM;
1331 		else
1332 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1333 		break;
1334 	case SO_BUSY_POLL_BUDGET:
1335 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1336 			ret = -EPERM;
1337 		} else {
1338 			if (val < 0 || val > U16_MAX)
1339 				ret = -EINVAL;
1340 			else
1341 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1342 		}
1343 		break;
1344 #endif
1345 
1346 	case SO_MAX_PACING_RATE:
1347 		{
1348 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1349 
1350 		if (sizeof(ulval) != sizeof(val) &&
1351 		    optlen >= sizeof(ulval) &&
1352 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1353 			ret = -EFAULT;
1354 			break;
1355 		}
1356 		if (ulval != ~0UL)
1357 			cmpxchg(&sk->sk_pacing_status,
1358 				SK_PACING_NONE,
1359 				SK_PACING_NEEDED);
1360 		sk->sk_max_pacing_rate = ulval;
1361 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1362 		break;
1363 		}
1364 	case SO_INCOMING_CPU:
1365 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1366 		break;
1367 
1368 	case SO_CNX_ADVICE:
1369 		if (val == 1)
1370 			dst_negative_advice(sk);
1371 		break;
1372 
1373 	case SO_ZEROCOPY:
1374 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1375 			if (!(sk_is_tcp(sk) ||
1376 			      (sk->sk_type == SOCK_DGRAM &&
1377 			       sk->sk_protocol == IPPROTO_UDP)))
1378 				ret = -ENOTSUPP;
1379 		} else if (sk->sk_family != PF_RDS) {
1380 			ret = -ENOTSUPP;
1381 		}
1382 		if (!ret) {
1383 			if (val < 0 || val > 1)
1384 				ret = -EINVAL;
1385 			else
1386 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1387 		}
1388 		break;
1389 
1390 	case SO_TXTIME:
1391 		if (optlen != sizeof(struct sock_txtime)) {
1392 			ret = -EINVAL;
1393 			break;
1394 		} else if (copy_from_sockptr(&sk_txtime, optval,
1395 			   sizeof(struct sock_txtime))) {
1396 			ret = -EFAULT;
1397 			break;
1398 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1399 			ret = -EINVAL;
1400 			break;
1401 		}
1402 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1403 		 * scheduler has enough safe guards.
1404 		 */
1405 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1406 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1407 			ret = -EPERM;
1408 			break;
1409 		}
1410 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1411 		sk->sk_clockid = sk_txtime.clockid;
1412 		sk->sk_txtime_deadline_mode =
1413 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1414 		sk->sk_txtime_report_errors =
1415 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1416 		break;
1417 
1418 	case SO_BINDTOIFINDEX:
1419 		ret = sock_bindtoindex_locked(sk, val);
1420 		break;
1421 
1422 	case SO_BUF_LOCK:
1423 		if (val & ~SOCK_BUF_LOCK_MASK) {
1424 			ret = -EINVAL;
1425 			break;
1426 		}
1427 		sk->sk_userlocks = val | (sk->sk_userlocks &
1428 					  ~SOCK_BUF_LOCK_MASK);
1429 		break;
1430 
1431 	case SO_RESERVE_MEM:
1432 	{
1433 		int delta;
1434 
1435 		if (val < 0) {
1436 			ret = -EINVAL;
1437 			break;
1438 		}
1439 
1440 		delta = val - sk->sk_reserved_mem;
1441 		if (delta < 0)
1442 			sock_release_reserved_memory(sk, -delta);
1443 		else
1444 			ret = sock_reserve_memory(sk, delta);
1445 		break;
1446 	}
1447 
1448 	default:
1449 		ret = -ENOPROTOOPT;
1450 		break;
1451 	}
1452 	release_sock(sk);
1453 	return ret;
1454 }
1455 EXPORT_SYMBOL(sock_setsockopt);
1456 
1457 static const struct cred *sk_get_peer_cred(struct sock *sk)
1458 {
1459 	const struct cred *cred;
1460 
1461 	spin_lock(&sk->sk_peer_lock);
1462 	cred = get_cred(sk->sk_peer_cred);
1463 	spin_unlock(&sk->sk_peer_lock);
1464 
1465 	return cred;
1466 }
1467 
1468 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1469 			  struct ucred *ucred)
1470 {
1471 	ucred->pid = pid_vnr(pid);
1472 	ucred->uid = ucred->gid = -1;
1473 	if (cred) {
1474 		struct user_namespace *current_ns = current_user_ns();
1475 
1476 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1477 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1478 	}
1479 }
1480 
1481 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1482 {
1483 	struct user_namespace *user_ns = current_user_ns();
1484 	int i;
1485 
1486 	for (i = 0; i < src->ngroups; i++)
1487 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1488 			return -EFAULT;
1489 
1490 	return 0;
1491 }
1492 
1493 int sock_getsockopt(struct socket *sock, int level, int optname,
1494 		    char __user *optval, int __user *optlen)
1495 {
1496 	struct sock *sk = sock->sk;
1497 
1498 	union {
1499 		int val;
1500 		u64 val64;
1501 		unsigned long ulval;
1502 		struct linger ling;
1503 		struct old_timeval32 tm32;
1504 		struct __kernel_old_timeval tm;
1505 		struct  __kernel_sock_timeval stm;
1506 		struct sock_txtime txtime;
1507 		struct so_timestamping timestamping;
1508 	} v;
1509 
1510 	int lv = sizeof(int);
1511 	int len;
1512 
1513 	if (get_user(len, optlen))
1514 		return -EFAULT;
1515 	if (len < 0)
1516 		return -EINVAL;
1517 
1518 	memset(&v, 0, sizeof(v));
1519 
1520 	switch (optname) {
1521 	case SO_DEBUG:
1522 		v.val = sock_flag(sk, SOCK_DBG);
1523 		break;
1524 
1525 	case SO_DONTROUTE:
1526 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1527 		break;
1528 
1529 	case SO_BROADCAST:
1530 		v.val = sock_flag(sk, SOCK_BROADCAST);
1531 		break;
1532 
1533 	case SO_SNDBUF:
1534 		v.val = sk->sk_sndbuf;
1535 		break;
1536 
1537 	case SO_RCVBUF:
1538 		v.val = sk->sk_rcvbuf;
1539 		break;
1540 
1541 	case SO_REUSEADDR:
1542 		v.val = sk->sk_reuse;
1543 		break;
1544 
1545 	case SO_REUSEPORT:
1546 		v.val = sk->sk_reuseport;
1547 		break;
1548 
1549 	case SO_KEEPALIVE:
1550 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1551 		break;
1552 
1553 	case SO_TYPE:
1554 		v.val = sk->sk_type;
1555 		break;
1556 
1557 	case SO_PROTOCOL:
1558 		v.val = sk->sk_protocol;
1559 		break;
1560 
1561 	case SO_DOMAIN:
1562 		v.val = sk->sk_family;
1563 		break;
1564 
1565 	case SO_ERROR:
1566 		v.val = -sock_error(sk);
1567 		if (v.val == 0)
1568 			v.val = xchg(&sk->sk_err_soft, 0);
1569 		break;
1570 
1571 	case SO_OOBINLINE:
1572 		v.val = sock_flag(sk, SOCK_URGINLINE);
1573 		break;
1574 
1575 	case SO_NO_CHECK:
1576 		v.val = sk->sk_no_check_tx;
1577 		break;
1578 
1579 	case SO_PRIORITY:
1580 		v.val = sk->sk_priority;
1581 		break;
1582 
1583 	case SO_LINGER:
1584 		lv		= sizeof(v.ling);
1585 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1586 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1587 		break;
1588 
1589 	case SO_BSDCOMPAT:
1590 		break;
1591 
1592 	case SO_TIMESTAMP_OLD:
1593 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1594 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1595 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1596 		break;
1597 
1598 	case SO_TIMESTAMPNS_OLD:
1599 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1600 		break;
1601 
1602 	case SO_TIMESTAMP_NEW:
1603 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1604 		break;
1605 
1606 	case SO_TIMESTAMPNS_NEW:
1607 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1608 		break;
1609 
1610 	case SO_TIMESTAMPING_OLD:
1611 		lv = sizeof(v.timestamping);
1612 		v.timestamping.flags = sk->sk_tsflags;
1613 		v.timestamping.bind_phc = sk->sk_bind_phc;
1614 		break;
1615 
1616 	case SO_RCVTIMEO_OLD:
1617 	case SO_RCVTIMEO_NEW:
1618 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1619 		break;
1620 
1621 	case SO_SNDTIMEO_OLD:
1622 	case SO_SNDTIMEO_NEW:
1623 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1624 		break;
1625 
1626 	case SO_RCVLOWAT:
1627 		v.val = sk->sk_rcvlowat;
1628 		break;
1629 
1630 	case SO_SNDLOWAT:
1631 		v.val = 1;
1632 		break;
1633 
1634 	case SO_PASSCRED:
1635 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1636 		break;
1637 
1638 	case SO_PEERCRED:
1639 	{
1640 		struct ucred peercred;
1641 		if (len > sizeof(peercred))
1642 			len = sizeof(peercred);
1643 
1644 		spin_lock(&sk->sk_peer_lock);
1645 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1646 		spin_unlock(&sk->sk_peer_lock);
1647 
1648 		if (copy_to_user(optval, &peercred, len))
1649 			return -EFAULT;
1650 		goto lenout;
1651 	}
1652 
1653 	case SO_PEERGROUPS:
1654 	{
1655 		const struct cred *cred;
1656 		int ret, n;
1657 
1658 		cred = sk_get_peer_cred(sk);
1659 		if (!cred)
1660 			return -ENODATA;
1661 
1662 		n = cred->group_info->ngroups;
1663 		if (len < n * sizeof(gid_t)) {
1664 			len = n * sizeof(gid_t);
1665 			put_cred(cred);
1666 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1667 		}
1668 		len = n * sizeof(gid_t);
1669 
1670 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1671 		put_cred(cred);
1672 		if (ret)
1673 			return ret;
1674 		goto lenout;
1675 	}
1676 
1677 	case SO_PEERNAME:
1678 	{
1679 		char address[128];
1680 
1681 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1682 		if (lv < 0)
1683 			return -ENOTCONN;
1684 		if (lv < len)
1685 			return -EINVAL;
1686 		if (copy_to_user(optval, address, len))
1687 			return -EFAULT;
1688 		goto lenout;
1689 	}
1690 
1691 	/* Dubious BSD thing... Probably nobody even uses it, but
1692 	 * the UNIX standard wants it for whatever reason... -DaveM
1693 	 */
1694 	case SO_ACCEPTCONN:
1695 		v.val = sk->sk_state == TCP_LISTEN;
1696 		break;
1697 
1698 	case SO_PASSSEC:
1699 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1700 		break;
1701 
1702 	case SO_PEERSEC:
1703 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1704 
1705 	case SO_MARK:
1706 		v.val = sk->sk_mark;
1707 		break;
1708 
1709 	case SO_RXQ_OVFL:
1710 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1711 		break;
1712 
1713 	case SO_WIFI_STATUS:
1714 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1715 		break;
1716 
1717 	case SO_PEEK_OFF:
1718 		if (!sock->ops->set_peek_off)
1719 			return -EOPNOTSUPP;
1720 
1721 		v.val = sk->sk_peek_off;
1722 		break;
1723 	case SO_NOFCS:
1724 		v.val = sock_flag(sk, SOCK_NOFCS);
1725 		break;
1726 
1727 	case SO_BINDTODEVICE:
1728 		return sock_getbindtodevice(sk, optval, optlen, len);
1729 
1730 	case SO_GET_FILTER:
1731 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1732 		if (len < 0)
1733 			return len;
1734 
1735 		goto lenout;
1736 
1737 	case SO_LOCK_FILTER:
1738 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1739 		break;
1740 
1741 	case SO_BPF_EXTENSIONS:
1742 		v.val = bpf_tell_extensions();
1743 		break;
1744 
1745 	case SO_SELECT_ERR_QUEUE:
1746 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1747 		break;
1748 
1749 #ifdef CONFIG_NET_RX_BUSY_POLL
1750 	case SO_BUSY_POLL:
1751 		v.val = sk->sk_ll_usec;
1752 		break;
1753 	case SO_PREFER_BUSY_POLL:
1754 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1755 		break;
1756 #endif
1757 
1758 	case SO_MAX_PACING_RATE:
1759 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1760 			lv = sizeof(v.ulval);
1761 			v.ulval = sk->sk_max_pacing_rate;
1762 		} else {
1763 			/* 32bit version */
1764 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1765 		}
1766 		break;
1767 
1768 	case SO_INCOMING_CPU:
1769 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1770 		break;
1771 
1772 	case SO_MEMINFO:
1773 	{
1774 		u32 meminfo[SK_MEMINFO_VARS];
1775 
1776 		sk_get_meminfo(sk, meminfo);
1777 
1778 		len = min_t(unsigned int, len, sizeof(meminfo));
1779 		if (copy_to_user(optval, &meminfo, len))
1780 			return -EFAULT;
1781 
1782 		goto lenout;
1783 	}
1784 
1785 #ifdef CONFIG_NET_RX_BUSY_POLL
1786 	case SO_INCOMING_NAPI_ID:
1787 		v.val = READ_ONCE(sk->sk_napi_id);
1788 
1789 		/* aggregate non-NAPI IDs down to 0 */
1790 		if (v.val < MIN_NAPI_ID)
1791 			v.val = 0;
1792 
1793 		break;
1794 #endif
1795 
1796 	case SO_COOKIE:
1797 		lv = sizeof(u64);
1798 		if (len < lv)
1799 			return -EINVAL;
1800 		v.val64 = sock_gen_cookie(sk);
1801 		break;
1802 
1803 	case SO_ZEROCOPY:
1804 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1805 		break;
1806 
1807 	case SO_TXTIME:
1808 		lv = sizeof(v.txtime);
1809 		v.txtime.clockid = sk->sk_clockid;
1810 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1811 				  SOF_TXTIME_DEADLINE_MODE : 0;
1812 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1813 				  SOF_TXTIME_REPORT_ERRORS : 0;
1814 		break;
1815 
1816 	case SO_BINDTOIFINDEX:
1817 		v.val = sk->sk_bound_dev_if;
1818 		break;
1819 
1820 	case SO_NETNS_COOKIE:
1821 		lv = sizeof(u64);
1822 		if (len != lv)
1823 			return -EINVAL;
1824 		v.val64 = sock_net(sk)->net_cookie;
1825 		break;
1826 
1827 	case SO_BUF_LOCK:
1828 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1829 		break;
1830 
1831 	case SO_RESERVE_MEM:
1832 		v.val = sk->sk_reserved_mem;
1833 		break;
1834 
1835 	default:
1836 		/* We implement the SO_SNDLOWAT etc to not be settable
1837 		 * (1003.1g 7).
1838 		 */
1839 		return -ENOPROTOOPT;
1840 	}
1841 
1842 	if (len > lv)
1843 		len = lv;
1844 	if (copy_to_user(optval, &v, len))
1845 		return -EFAULT;
1846 lenout:
1847 	if (put_user(len, optlen))
1848 		return -EFAULT;
1849 	return 0;
1850 }
1851 
1852 /*
1853  * Initialize an sk_lock.
1854  *
1855  * (We also register the sk_lock with the lock validator.)
1856  */
1857 static inline void sock_lock_init(struct sock *sk)
1858 {
1859 	if (sk->sk_kern_sock)
1860 		sock_lock_init_class_and_name(
1861 			sk,
1862 			af_family_kern_slock_key_strings[sk->sk_family],
1863 			af_family_kern_slock_keys + sk->sk_family,
1864 			af_family_kern_key_strings[sk->sk_family],
1865 			af_family_kern_keys + sk->sk_family);
1866 	else
1867 		sock_lock_init_class_and_name(
1868 			sk,
1869 			af_family_slock_key_strings[sk->sk_family],
1870 			af_family_slock_keys + sk->sk_family,
1871 			af_family_key_strings[sk->sk_family],
1872 			af_family_keys + sk->sk_family);
1873 }
1874 
1875 /*
1876  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1877  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1878  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1879  */
1880 static void sock_copy(struct sock *nsk, const struct sock *osk)
1881 {
1882 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1883 #ifdef CONFIG_SECURITY_NETWORK
1884 	void *sptr = nsk->sk_security;
1885 #endif
1886 
1887 	/* If we move sk_tx_queue_mapping out of the private section,
1888 	 * we must check if sk_tx_queue_clear() is called after
1889 	 * sock_copy() in sk_clone_lock().
1890 	 */
1891 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1892 		     offsetof(struct sock, sk_dontcopy_begin) ||
1893 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1894 		     offsetof(struct sock, sk_dontcopy_end));
1895 
1896 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1897 
1898 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1899 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1900 
1901 #ifdef CONFIG_SECURITY_NETWORK
1902 	nsk->sk_security = sptr;
1903 	security_sk_clone(osk, nsk);
1904 #endif
1905 }
1906 
1907 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1908 		int family)
1909 {
1910 	struct sock *sk;
1911 	struct kmem_cache *slab;
1912 
1913 	slab = prot->slab;
1914 	if (slab != NULL) {
1915 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1916 		if (!sk)
1917 			return sk;
1918 		if (want_init_on_alloc(priority))
1919 			sk_prot_clear_nulls(sk, prot->obj_size);
1920 	} else
1921 		sk = kmalloc(prot->obj_size, priority);
1922 
1923 	if (sk != NULL) {
1924 		if (security_sk_alloc(sk, family, priority))
1925 			goto out_free;
1926 
1927 		if (!try_module_get(prot->owner))
1928 			goto out_free_sec;
1929 	}
1930 
1931 	return sk;
1932 
1933 out_free_sec:
1934 	security_sk_free(sk);
1935 out_free:
1936 	if (slab != NULL)
1937 		kmem_cache_free(slab, sk);
1938 	else
1939 		kfree(sk);
1940 	return NULL;
1941 }
1942 
1943 static void sk_prot_free(struct proto *prot, struct sock *sk)
1944 {
1945 	struct kmem_cache *slab;
1946 	struct module *owner;
1947 
1948 	owner = prot->owner;
1949 	slab = prot->slab;
1950 
1951 	cgroup_sk_free(&sk->sk_cgrp_data);
1952 	mem_cgroup_sk_free(sk);
1953 	security_sk_free(sk);
1954 	if (slab != NULL)
1955 		kmem_cache_free(slab, sk);
1956 	else
1957 		kfree(sk);
1958 	module_put(owner);
1959 }
1960 
1961 /**
1962  *	sk_alloc - All socket objects are allocated here
1963  *	@net: the applicable net namespace
1964  *	@family: protocol family
1965  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1966  *	@prot: struct proto associated with this new sock instance
1967  *	@kern: is this to be a kernel socket?
1968  */
1969 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1970 		      struct proto *prot, int kern)
1971 {
1972 	struct sock *sk;
1973 
1974 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1975 	if (sk) {
1976 		sk->sk_family = family;
1977 		/*
1978 		 * See comment in struct sock definition to understand
1979 		 * why we need sk_prot_creator -acme
1980 		 */
1981 		sk->sk_prot = sk->sk_prot_creator = prot;
1982 		sk->sk_kern_sock = kern;
1983 		sock_lock_init(sk);
1984 		sk->sk_net_refcnt = kern ? 0 : 1;
1985 		if (likely(sk->sk_net_refcnt)) {
1986 			get_net_track(net, &sk->ns_tracker, priority);
1987 			sock_inuse_add(net, 1);
1988 		}
1989 
1990 		sock_net_set(sk, net);
1991 		refcount_set(&sk->sk_wmem_alloc, 1);
1992 
1993 		mem_cgroup_sk_alloc(sk);
1994 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1995 		sock_update_classid(&sk->sk_cgrp_data);
1996 		sock_update_netprioidx(&sk->sk_cgrp_data);
1997 		sk_tx_queue_clear(sk);
1998 	}
1999 
2000 	return sk;
2001 }
2002 EXPORT_SYMBOL(sk_alloc);
2003 
2004 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2005  * grace period. This is the case for UDP sockets and TCP listeners.
2006  */
2007 static void __sk_destruct(struct rcu_head *head)
2008 {
2009 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2010 	struct sk_filter *filter;
2011 
2012 	if (sk->sk_destruct)
2013 		sk->sk_destruct(sk);
2014 
2015 	filter = rcu_dereference_check(sk->sk_filter,
2016 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2017 	if (filter) {
2018 		sk_filter_uncharge(sk, filter);
2019 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2020 	}
2021 
2022 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2023 
2024 #ifdef CONFIG_BPF_SYSCALL
2025 	bpf_sk_storage_free(sk);
2026 #endif
2027 
2028 	if (atomic_read(&sk->sk_omem_alloc))
2029 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2030 			 __func__, atomic_read(&sk->sk_omem_alloc));
2031 
2032 	if (sk->sk_frag.page) {
2033 		put_page(sk->sk_frag.page);
2034 		sk->sk_frag.page = NULL;
2035 	}
2036 
2037 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2038 	put_cred(sk->sk_peer_cred);
2039 	put_pid(sk->sk_peer_pid);
2040 
2041 	if (likely(sk->sk_net_refcnt))
2042 		put_net_track(sock_net(sk), &sk->ns_tracker);
2043 	sk_prot_free(sk->sk_prot_creator, sk);
2044 }
2045 
2046 void sk_destruct(struct sock *sk)
2047 {
2048 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2049 
2050 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2051 		reuseport_detach_sock(sk);
2052 		use_call_rcu = true;
2053 	}
2054 
2055 	if (use_call_rcu)
2056 		call_rcu(&sk->sk_rcu, __sk_destruct);
2057 	else
2058 		__sk_destruct(&sk->sk_rcu);
2059 }
2060 
2061 static void __sk_free(struct sock *sk)
2062 {
2063 	if (likely(sk->sk_net_refcnt))
2064 		sock_inuse_add(sock_net(sk), -1);
2065 
2066 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2067 		sock_diag_broadcast_destroy(sk);
2068 	else
2069 		sk_destruct(sk);
2070 }
2071 
2072 void sk_free(struct sock *sk)
2073 {
2074 	/*
2075 	 * We subtract one from sk_wmem_alloc and can know if
2076 	 * some packets are still in some tx queue.
2077 	 * If not null, sock_wfree() will call __sk_free(sk) later
2078 	 */
2079 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2080 		__sk_free(sk);
2081 }
2082 EXPORT_SYMBOL(sk_free);
2083 
2084 static void sk_init_common(struct sock *sk)
2085 {
2086 	skb_queue_head_init(&sk->sk_receive_queue);
2087 	skb_queue_head_init(&sk->sk_write_queue);
2088 	skb_queue_head_init(&sk->sk_error_queue);
2089 
2090 	rwlock_init(&sk->sk_callback_lock);
2091 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2092 			af_rlock_keys + sk->sk_family,
2093 			af_family_rlock_key_strings[sk->sk_family]);
2094 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2095 			af_wlock_keys + sk->sk_family,
2096 			af_family_wlock_key_strings[sk->sk_family]);
2097 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2098 			af_elock_keys + sk->sk_family,
2099 			af_family_elock_key_strings[sk->sk_family]);
2100 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2101 			af_callback_keys + sk->sk_family,
2102 			af_family_clock_key_strings[sk->sk_family]);
2103 }
2104 
2105 /**
2106  *	sk_clone_lock - clone a socket, and lock its clone
2107  *	@sk: the socket to clone
2108  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2109  *
2110  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2111  */
2112 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2113 {
2114 	struct proto *prot = READ_ONCE(sk->sk_prot);
2115 	struct sk_filter *filter;
2116 	bool is_charged = true;
2117 	struct sock *newsk;
2118 
2119 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2120 	if (!newsk)
2121 		goto out;
2122 
2123 	sock_copy(newsk, sk);
2124 
2125 	newsk->sk_prot_creator = prot;
2126 
2127 	/* SANITY */
2128 	if (likely(newsk->sk_net_refcnt)) {
2129 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2130 		sock_inuse_add(sock_net(newsk), 1);
2131 	}
2132 	sk_node_init(&newsk->sk_node);
2133 	sock_lock_init(newsk);
2134 	bh_lock_sock(newsk);
2135 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2136 	newsk->sk_backlog.len = 0;
2137 
2138 	atomic_set(&newsk->sk_rmem_alloc, 0);
2139 
2140 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2141 	refcount_set(&newsk->sk_wmem_alloc, 1);
2142 
2143 	atomic_set(&newsk->sk_omem_alloc, 0);
2144 	sk_init_common(newsk);
2145 
2146 	newsk->sk_dst_cache	= NULL;
2147 	newsk->sk_dst_pending_confirm = 0;
2148 	newsk->sk_wmem_queued	= 0;
2149 	newsk->sk_forward_alloc = 0;
2150 	newsk->sk_reserved_mem  = 0;
2151 	atomic_set(&newsk->sk_drops, 0);
2152 	newsk->sk_send_head	= NULL;
2153 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2154 	atomic_set(&newsk->sk_zckey, 0);
2155 
2156 	sock_reset_flag(newsk, SOCK_DONE);
2157 
2158 	/* sk->sk_memcg will be populated at accept() time */
2159 	newsk->sk_memcg = NULL;
2160 
2161 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2162 
2163 	rcu_read_lock();
2164 	filter = rcu_dereference(sk->sk_filter);
2165 	if (filter != NULL)
2166 		/* though it's an empty new sock, the charging may fail
2167 		 * if sysctl_optmem_max was changed between creation of
2168 		 * original socket and cloning
2169 		 */
2170 		is_charged = sk_filter_charge(newsk, filter);
2171 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2172 	rcu_read_unlock();
2173 
2174 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2175 		/* We need to make sure that we don't uncharge the new
2176 		 * socket if we couldn't charge it in the first place
2177 		 * as otherwise we uncharge the parent's filter.
2178 		 */
2179 		if (!is_charged)
2180 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2181 		sk_free_unlock_clone(newsk);
2182 		newsk = NULL;
2183 		goto out;
2184 	}
2185 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2186 
2187 	if (bpf_sk_storage_clone(sk, newsk)) {
2188 		sk_free_unlock_clone(newsk);
2189 		newsk = NULL;
2190 		goto out;
2191 	}
2192 
2193 	/* Clear sk_user_data if parent had the pointer tagged
2194 	 * as not suitable for copying when cloning.
2195 	 */
2196 	if (sk_user_data_is_nocopy(newsk))
2197 		newsk->sk_user_data = NULL;
2198 
2199 	newsk->sk_err	   = 0;
2200 	newsk->sk_err_soft = 0;
2201 	newsk->sk_priority = 0;
2202 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2203 
2204 	/* Before updating sk_refcnt, we must commit prior changes to memory
2205 	 * (Documentation/RCU/rculist_nulls.rst for details)
2206 	 */
2207 	smp_wmb();
2208 	refcount_set(&newsk->sk_refcnt, 2);
2209 
2210 	/* Increment the counter in the same struct proto as the master
2211 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2212 	 * is the same as sk->sk_prot->socks, as this field was copied
2213 	 * with memcpy).
2214 	 *
2215 	 * This _changes_ the previous behaviour, where
2216 	 * tcp_create_openreq_child always was incrementing the
2217 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2218 	 * to be taken into account in all callers. -acme
2219 	 */
2220 	sk_refcnt_debug_inc(newsk);
2221 	sk_set_socket(newsk, NULL);
2222 	sk_tx_queue_clear(newsk);
2223 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2224 
2225 	if (newsk->sk_prot->sockets_allocated)
2226 		sk_sockets_allocated_inc(newsk);
2227 
2228 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2229 		net_enable_timestamp();
2230 out:
2231 	return newsk;
2232 }
2233 EXPORT_SYMBOL_GPL(sk_clone_lock);
2234 
2235 void sk_free_unlock_clone(struct sock *sk)
2236 {
2237 	/* It is still raw copy of parent, so invalidate
2238 	 * destructor and make plain sk_free() */
2239 	sk->sk_destruct = NULL;
2240 	bh_unlock_sock(sk);
2241 	sk_free(sk);
2242 }
2243 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2244 
2245 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2246 {
2247 	u32 max_segs = 1;
2248 
2249 	sk_dst_set(sk, dst);
2250 	sk->sk_route_caps = dst->dev->features;
2251 	if (sk_is_tcp(sk))
2252 		sk->sk_route_caps |= NETIF_F_GSO;
2253 	if (sk->sk_route_caps & NETIF_F_GSO)
2254 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2255 	if (unlikely(sk->sk_gso_disabled))
2256 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2257 	if (sk_can_gso(sk)) {
2258 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2259 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2260 		} else {
2261 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2262 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2263 			sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2264 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2265 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2266 		}
2267 	}
2268 	sk->sk_gso_max_segs = max_segs;
2269 }
2270 EXPORT_SYMBOL_GPL(sk_setup_caps);
2271 
2272 /*
2273  *	Simple resource managers for sockets.
2274  */
2275 
2276 
2277 /*
2278  * Write buffer destructor automatically called from kfree_skb.
2279  */
2280 void sock_wfree(struct sk_buff *skb)
2281 {
2282 	struct sock *sk = skb->sk;
2283 	unsigned int len = skb->truesize;
2284 
2285 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2286 		/*
2287 		 * Keep a reference on sk_wmem_alloc, this will be released
2288 		 * after sk_write_space() call
2289 		 */
2290 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2291 		sk->sk_write_space(sk);
2292 		len = 1;
2293 	}
2294 	/*
2295 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2296 	 * could not do because of in-flight packets
2297 	 */
2298 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2299 		__sk_free(sk);
2300 }
2301 EXPORT_SYMBOL(sock_wfree);
2302 
2303 /* This variant of sock_wfree() is used by TCP,
2304  * since it sets SOCK_USE_WRITE_QUEUE.
2305  */
2306 void __sock_wfree(struct sk_buff *skb)
2307 {
2308 	struct sock *sk = skb->sk;
2309 
2310 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2311 		__sk_free(sk);
2312 }
2313 
2314 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2315 {
2316 	skb_orphan(skb);
2317 	skb->sk = sk;
2318 #ifdef CONFIG_INET
2319 	if (unlikely(!sk_fullsock(sk))) {
2320 		skb->destructor = sock_edemux;
2321 		sock_hold(sk);
2322 		return;
2323 	}
2324 #endif
2325 	skb->destructor = sock_wfree;
2326 	skb_set_hash_from_sk(skb, sk);
2327 	/*
2328 	 * We used to take a refcount on sk, but following operation
2329 	 * is enough to guarantee sk_free() wont free this sock until
2330 	 * all in-flight packets are completed
2331 	 */
2332 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2333 }
2334 EXPORT_SYMBOL(skb_set_owner_w);
2335 
2336 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2337 {
2338 #ifdef CONFIG_TLS_DEVICE
2339 	/* Drivers depend on in-order delivery for crypto offload,
2340 	 * partial orphan breaks out-of-order-OK logic.
2341 	 */
2342 	if (skb->decrypted)
2343 		return false;
2344 #endif
2345 	return (skb->destructor == sock_wfree ||
2346 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2347 }
2348 
2349 /* This helper is used by netem, as it can hold packets in its
2350  * delay queue. We want to allow the owner socket to send more
2351  * packets, as if they were already TX completed by a typical driver.
2352  * But we also want to keep skb->sk set because some packet schedulers
2353  * rely on it (sch_fq for example).
2354  */
2355 void skb_orphan_partial(struct sk_buff *skb)
2356 {
2357 	if (skb_is_tcp_pure_ack(skb))
2358 		return;
2359 
2360 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2361 		return;
2362 
2363 	skb_orphan(skb);
2364 }
2365 EXPORT_SYMBOL(skb_orphan_partial);
2366 
2367 /*
2368  * Read buffer destructor automatically called from kfree_skb.
2369  */
2370 void sock_rfree(struct sk_buff *skb)
2371 {
2372 	struct sock *sk = skb->sk;
2373 	unsigned int len = skb->truesize;
2374 
2375 	atomic_sub(len, &sk->sk_rmem_alloc);
2376 	sk_mem_uncharge(sk, len);
2377 }
2378 EXPORT_SYMBOL(sock_rfree);
2379 
2380 /*
2381  * Buffer destructor for skbs that are not used directly in read or write
2382  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2383  */
2384 void sock_efree(struct sk_buff *skb)
2385 {
2386 	sock_put(skb->sk);
2387 }
2388 EXPORT_SYMBOL(sock_efree);
2389 
/* Destructor for the prefetch/receive path, where a reference count may
 * not be held (e.g. for listen sockets): the socket is only put when
 * sk_is_refcounted() says a reference was taken.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2401 
2402 kuid_t sock_i_uid(struct sock *sk)
2403 {
2404 	kuid_t uid;
2405 
2406 	read_lock_bh(&sk->sk_callback_lock);
2407 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2408 	read_unlock_bh(&sk->sk_callback_lock);
2409 	return uid;
2410 }
2411 EXPORT_SYMBOL(sock_i_uid);
2412 
2413 unsigned long sock_i_ino(struct sock *sk)
2414 {
2415 	unsigned long ino;
2416 
2417 	read_lock_bh(&sk->sk_callback_lock);
2418 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2419 	read_unlock_bh(&sk->sk_callback_lock);
2420 	return ino;
2421 }
2422 EXPORT_SYMBOL(sock_i_ino);
2423 
2424 /*
2425  * Allocate a skb from the socket's send buffer.
2426  */
2427 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2428 			     gfp_t priority)
2429 {
2430 	if (force ||
2431 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2432 		struct sk_buff *skb = alloc_skb(size, priority);
2433 
2434 		if (skb) {
2435 			skb_set_owner_w(skb, sk);
2436 			return skb;
2437 		}
2438 	}
2439 	return NULL;
2440 }
2441 EXPORT_SYMBOL(sock_wmalloc);
2442 
2443 static void sock_ofree(struct sk_buff *skb)
2444 {
2445 	struct sock *sk = skb->sk;
2446 
2447 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2448 }
2449 
2450 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2451 			     gfp_t priority)
2452 {
2453 	struct sk_buff *skb;
2454 
2455 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2456 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2457 	    sysctl_optmem_max)
2458 		return NULL;
2459 
2460 	skb = alloc_skb(size, priority);
2461 	if (!skb)
2462 		return NULL;
2463 
2464 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2465 	skb->sk = sk;
2466 	skb->destructor = sock_ofree;
2467 	return skb;
2468 }
2469 
2470 /*
2471  * Allocate a memory block from the socket's option memory buffer.
2472  */
2473 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2474 {
2475 	if ((unsigned int)size <= sysctl_optmem_max &&
2476 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2477 		void *mem;
2478 		/* First do the add, to avoid the race if kmalloc
2479 		 * might sleep.
2480 		 */
2481 		atomic_add(size, &sk->sk_omem_alloc);
2482 		mem = kmalloc(size, priority);
2483 		if (mem)
2484 			return mem;
2485 		atomic_sub(size, &sk->sk_omem_alloc);
2486 	}
2487 	return NULL;
2488 }
2489 EXPORT_SYMBOL(sock_kmalloc);
2490 
2491 /* Free an option memory block. Note, we actually want the inline
2492  * here as this allows gcc to detect the nullify and fold away the
2493  * condition entirely.
2494  */
2495 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2496 				  const bool nullify)
2497 {
2498 	if (WARN_ON_ONCE(!mem))
2499 		return;
2500 	if (nullify)
2501 		kfree_sensitive(mem);
2502 	else
2503 		kfree(mem);
2504 	atomic_sub(size, &sk->sk_omem_alloc);
2505 }
2506 
2507 void sock_kfree_s(struct sock *sk, void *mem, int size)
2508 {
2509 	__sock_kfree_s(sk, mem, size, false);
2510 }
2511 EXPORT_SYMBOL(sock_kfree_s);
2512 
2513 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2514 {
2515 	__sock_kfree_s(sk, mem, size, true);
2516 }
2517 EXPORT_SYMBOL(sock_kzfree_s);
2518 
2519 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2520    I think, these locks should be removed for datagram sockets.
2521  */
2522 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2523 {
2524 	DEFINE_WAIT(wait);
2525 
2526 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2527 	for (;;) {
2528 		if (!timeo)
2529 			break;
2530 		if (signal_pending(current))
2531 			break;
2532 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2533 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2534 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2535 			break;
2536 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2537 			break;
2538 		if (sk->sk_err)
2539 			break;
2540 		timeo = schedule_timeout(timeo);
2541 	}
2542 	finish_wait(sk_sleep(sk), &wait);
2543 	return timeo;
2544 }
2545 
2546 
2547 /*
2548  *	Generic send/receive buffer handlers
2549  */
2550 
2551 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2552 				     unsigned long data_len, int noblock,
2553 				     int *errcode, int max_page_order)
2554 {
2555 	struct sk_buff *skb;
2556 	long timeo;
2557 	int err;
2558 
2559 	timeo = sock_sndtimeo(sk, noblock);
2560 	for (;;) {
2561 		err = sock_error(sk);
2562 		if (err != 0)
2563 			goto failure;
2564 
2565 		err = -EPIPE;
2566 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2567 			goto failure;
2568 
2569 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2570 			break;
2571 
2572 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2573 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2574 		err = -EAGAIN;
2575 		if (!timeo)
2576 			goto failure;
2577 		if (signal_pending(current))
2578 			goto interrupted;
2579 		timeo = sock_wait_for_wmem(sk, timeo);
2580 	}
2581 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2582 				   errcode, sk->sk_allocation);
2583 	if (skb)
2584 		skb_set_owner_w(skb, sk);
2585 	return skb;
2586 
2587 interrupted:
2588 	err = sock_intr_errno(timeo);
2589 failure:
2590 	*errcode = err;
2591 	return NULL;
2592 }
2593 EXPORT_SYMBOL(sock_alloc_send_pskb);
2594 
/* Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a maximum page order
 * of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2600 EXPORT_SYMBOL(sock_alloc_send_skb);
2601 
2602 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2603 		     struct sockcm_cookie *sockc)
2604 {
2605 	u32 tsflags;
2606 
2607 	switch (cmsg->cmsg_type) {
2608 	case SO_MARK:
2609 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2610 			return -EPERM;
2611 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2612 			return -EINVAL;
2613 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2614 		break;
2615 	case SO_TIMESTAMPING_OLD:
2616 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2617 			return -EINVAL;
2618 
2619 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2620 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2621 			return -EINVAL;
2622 
2623 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2624 		sockc->tsflags |= tsflags;
2625 		break;
2626 	case SCM_TXTIME:
2627 		if (!sock_flag(sk, SOCK_TXTIME))
2628 			return -EINVAL;
2629 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2630 			return -EINVAL;
2631 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2632 		break;
2633 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2634 	case SCM_RIGHTS:
2635 	case SCM_CREDENTIALS:
2636 		break;
2637 	default:
2638 		return -EINVAL;
2639 	}
2640 	return 0;
2641 }
2642 EXPORT_SYMBOL(__sock_cmsg_send);
2643 
2644 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2645 		   struct sockcm_cookie *sockc)
2646 {
2647 	struct cmsghdr *cmsg;
2648 	int ret;
2649 
2650 	for_each_cmsghdr(cmsg, msg) {
2651 		if (!CMSG_OK(msg, cmsg))
2652 			return -EINVAL;
2653 		if (cmsg->cmsg_level != SOL_SOCKET)
2654 			continue;
2655 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2656 		if (ret)
2657 			return ret;
2658 	}
2659 	return 0;
2660 }
2661 EXPORT_SYMBOL(sock_cmsg_send);
2662 
2663 static void sk_enter_memory_pressure(struct sock *sk)
2664 {
2665 	if (!sk->sk_prot->enter_memory_pressure)
2666 		return;
2667 
2668 	sk->sk_prot->enter_memory_pressure(sk);
2669 }
2670 
2671 static void sk_leave_memory_pressure(struct sock *sk)
2672 {
2673 	if (sk->sk_prot->leave_memory_pressure) {
2674 		sk->sk_prot->leave_memory_pressure(sk);
2675 	} else {
2676 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2677 
2678 		if (memory_pressure && READ_ONCE(*memory_pressure))
2679 			WRITE_ONCE(*memory_pressure, 0);
2680 	}
2681 }
2682 
2683 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2684 
2685 /**
2686  * skb_page_frag_refill - check that a page_frag contains enough room
2687  * @sz: minimum size of the fragment we want to get
2688  * @pfrag: pointer to page_frag
2689  * @gfp: priority for memory allocation
2690  *
2691  * Note: While this allocator tries to use high order pages, there is
2692  * no guarantee that allocations succeed. Therefore, @sz MUST be
2693  * less or equal than PAGE_SIZE.
2694  */
2695 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2696 {
2697 	if (pfrag->page) {
2698 		if (page_ref_count(pfrag->page) == 1) {
2699 			pfrag->offset = 0;
2700 			return true;
2701 		}
2702 		if (pfrag->offset + sz <= pfrag->size)
2703 			return true;
2704 		put_page(pfrag->page);
2705 	}
2706 
2707 	pfrag->offset = 0;
2708 	if (SKB_FRAG_PAGE_ORDER &&
2709 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2710 		/* Avoid direct reclaim but allow kswapd to wake */
2711 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2712 					  __GFP_COMP | __GFP_NOWARN |
2713 					  __GFP_NORETRY,
2714 					  SKB_FRAG_PAGE_ORDER);
2715 		if (likely(pfrag->page)) {
2716 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2717 			return true;
2718 		}
2719 	}
2720 	pfrag->page = alloc_page(gfp);
2721 	if (likely(pfrag->page)) {
2722 		pfrag->size = PAGE_SIZE;
2723 		return true;
2724 	}
2725 	return false;
2726 }
2727 EXPORT_SYMBOL(skb_page_frag_refill);
2728 
2729 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2730 {
2731 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2732 		return true;
2733 
2734 	sk_enter_memory_pressure(sk);
2735 	sk_stream_moderate_sndbuf(sk);
2736 	return false;
2737 }
2738 EXPORT_SYMBOL(sk_page_frag_refill);
2739 
2740 void __lock_sock(struct sock *sk)
2741 	__releases(&sk->sk_lock.slock)
2742 	__acquires(&sk->sk_lock.slock)
2743 {
2744 	DEFINE_WAIT(wait);
2745 
2746 	for (;;) {
2747 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2748 					TASK_UNINTERRUPTIBLE);
2749 		spin_unlock_bh(&sk->sk_lock.slock);
2750 		schedule();
2751 		spin_lock_bh(&sk->sk_lock.slock);
2752 		if (!sock_owned_by_user(sk))
2753 			break;
2754 	}
2755 	finish_wait(&sk->sk_lock.wq, &wait);
2756 }
2757 
2758 void __release_sock(struct sock *sk)
2759 	__releases(&sk->sk_lock.slock)
2760 	__acquires(&sk->sk_lock.slock)
2761 {
2762 	struct sk_buff *skb, *next;
2763 
2764 	while ((skb = sk->sk_backlog.head) != NULL) {
2765 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2766 
2767 		spin_unlock_bh(&sk->sk_lock.slock);
2768 
2769 		do {
2770 			next = skb->next;
2771 			prefetch(next);
2772 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2773 			skb_mark_not_on_list(skb);
2774 			sk_backlog_rcv(sk, skb);
2775 
2776 			cond_resched();
2777 
2778 			skb = next;
2779 		} while (skb != NULL);
2780 
2781 		spin_lock_bh(&sk->sk_lock.slock);
2782 	}
2783 
2784 	/*
2785 	 * Doing the zeroing here guarantee we can not loop forever
2786 	 * while a wild producer attempts to flood us.
2787 	 */
2788 	sk->sk_backlog.len = 0;
2789 }
2790 
2791 void __sk_flush_backlog(struct sock *sk)
2792 {
2793 	spin_lock_bh(&sk->sk_lock.slock);
2794 	__release_sock(sk);
2795 	spin_unlock_bh(&sk->sk_lock.slock);
2796 }
2797 
2798 /**
2799  * sk_wait_data - wait for data to arrive at sk_receive_queue
2800  * @sk:    sock to wait on
2801  * @timeo: for how long
2802  * @skb:   last skb seen on sk_receive_queue
2803  *
2804  * Now socket state including sk->sk_err is changed only under lock,
2805  * hence we may omit checks after joining wait queue.
2806  * We check receive queue before schedule() only as optimization;
2807  * it is very likely that release_sock() added new data.
2808  */
2809 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2810 {
2811 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2812 	int rc;
2813 
2814 	add_wait_queue(sk_sleep(sk), &wait);
2815 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2816 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2817 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2818 	remove_wait_queue(sk_sleep(sk), &wait);
2819 	return rc;
2820 }
2821 EXPORT_SYMBOL(sk_wait_data);
2822 
2823 /**
2824  *	__sk_mem_raise_allocated - increase memory_allocated
2825  *	@sk: socket
2826  *	@size: memory size to allocate
2827  *	@amt: pages to allocate
2828  *	@kind: allocation type
2829  *
2830  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2831  */
2832 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2833 {
2834 	struct proto *prot = sk->sk_prot;
2835 	long allocated = sk_memory_allocated_add(sk, amt);
2836 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2837 	bool charged = true;
2838 
2839 	if (memcg_charge &&
2840 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2841 						gfp_memcg_charge())))
2842 		goto suppress_allocation;
2843 
2844 	/* Under limit. */
2845 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2846 		sk_leave_memory_pressure(sk);
2847 		return 1;
2848 	}
2849 
2850 	/* Under pressure. */
2851 	if (allocated > sk_prot_mem_limits(sk, 1))
2852 		sk_enter_memory_pressure(sk);
2853 
2854 	/* Over hard limit. */
2855 	if (allocated > sk_prot_mem_limits(sk, 2))
2856 		goto suppress_allocation;
2857 
2858 	/* guarantee minimum buffer size under pressure */
2859 	if (kind == SK_MEM_RECV) {
2860 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2861 			return 1;
2862 
2863 	} else { /* SK_MEM_SEND */
2864 		int wmem0 = sk_get_wmem0(sk, prot);
2865 
2866 		if (sk->sk_type == SOCK_STREAM) {
2867 			if (sk->sk_wmem_queued < wmem0)
2868 				return 1;
2869 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2870 				return 1;
2871 		}
2872 	}
2873 
2874 	if (sk_has_memory_pressure(sk)) {
2875 		u64 alloc;
2876 
2877 		if (!sk_under_memory_pressure(sk))
2878 			return 1;
2879 		alloc = sk_sockets_allocated_read_positive(sk);
2880 		if (sk_prot_mem_limits(sk, 2) > alloc *
2881 		    sk_mem_pages(sk->sk_wmem_queued +
2882 				 atomic_read(&sk->sk_rmem_alloc) +
2883 				 sk->sk_forward_alloc))
2884 			return 1;
2885 	}
2886 
2887 suppress_allocation:
2888 
2889 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2890 		sk_stream_moderate_sndbuf(sk);
2891 
2892 		/* Fail only if socket is _under_ its sndbuf.
2893 		 * In this case we cannot block, so that we have to fail.
2894 		 */
2895 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2896 			/* Force charge with __GFP_NOFAIL */
2897 			if (memcg_charge && !charged) {
2898 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2899 					gfp_memcg_charge() | __GFP_NOFAIL);
2900 			}
2901 			return 1;
2902 		}
2903 	}
2904 
2905 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2906 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2907 
2908 	sk_memory_allocated_sub(sk, amt);
2909 
2910 	if (memcg_charge && charged)
2911 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2912 
2913 	return 0;
2914 }
2915 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2916 
2917 /**
2918  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2919  *	@sk: socket
2920  *	@size: memory size to allocate
2921  *	@kind: allocation type
2922  *
2923  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2924  *	rmem allocation. This function assumes that protocols which have
2925  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2926  */
2927 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2928 {
2929 	int ret, amt = sk_mem_pages(size);
2930 
2931 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2932 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2933 	if (!ret)
2934 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2935 	return ret;
2936 }
2937 EXPORT_SYMBOL(__sk_mem_schedule);
2938 
2939 /**
2940  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2941  *	@sk: socket
2942  *	@amount: number of quanta
2943  *
2944  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2945  */
2946 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2947 {
2948 	sk_memory_allocated_sub(sk, amount);
2949 
2950 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2951 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2952 
2953 	if (sk_under_memory_pressure(sk) &&
2954 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2955 		sk_leave_memory_pressure(sk);
2956 }
2957 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2958 
2959 /**
2960  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2961  *	@sk: socket
2962  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2963  */
2964 void __sk_mem_reclaim(struct sock *sk, int amount)
2965 {
2966 	amount >>= SK_MEM_QUANTUM_SHIFT;
2967 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2968 	__sk_mem_reduce_allocated(sk, amount);
2969 }
2970 EXPORT_SYMBOL(__sk_mem_reclaim);
2971 
2972 int sk_set_peek_off(struct sock *sk, int val)
2973 {
2974 	sk->sk_peek_off = val;
2975 	return 0;
2976 }
2977 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2978 
2979 /*
2980  * Set of default routines for initialising struct proto_ops when
2981  * the protocol does not support a particular function. In certain
2982  * cases where it makes no sense for a protocol to have a "do nothing"
2983  * function, some default processing is provided.
2984  */
2985 
/* Default bind handler for protocols without bind support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2990 EXPORT_SYMBOL(sock_no_bind);
2991 
/* Default connect handler for protocols without connect support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2997 EXPORT_SYMBOL(sock_no_connect);
2998 
/* Default socketpair handler for protocols without socketpair support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
3003 EXPORT_SYMBOL(sock_no_socketpair);
3004 
/* Default accept handler for protocols without accept support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
3010 EXPORT_SYMBOL(sock_no_accept);
3011 
/* Default getname handler for protocols without getname support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
3017 EXPORT_SYMBOL(sock_no_getname);
3018 
/* Default ioctl handler for protocols without ioctl support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
3023 EXPORT_SYMBOL(sock_no_ioctl);
3024 
/* Default listen handler for protocols without listen support: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
3029 EXPORT_SYMBOL(sock_no_listen);
3030 
/* Default shutdown handler for protocols without shutdown support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
3035 EXPORT_SYMBOL(sock_no_shutdown);
3036 
/* Default sendmsg handler for protocols without sendmsg support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3041 EXPORT_SYMBOL(sock_no_sendmsg);
3042 
/* Counterpart of sock_no_sendmsg() that takes a struct sock: always
 * fails with -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
3047 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3048 
/* Default recvmsg handler for protocols without recvmsg support:
 * always fails with -EOPNOTSUPP.
 */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
3054 EXPORT_SYMBOL(sock_no_recvmsg);
3055 
/* Default mmap handler for protocols without mmap support. Unlike the
 * other stubs this one fails with -ENODEV.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
3061 EXPORT_SYMBOL(sock_no_mmap);
3062 
3063 /*
3064  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3065  * various sock-based usage counts.
3066  */
3067 void __receive_sock(struct file *file)
3068 {
3069 	struct socket *sock;
3070 
3071 	sock = sock_from_file(file);
3072 	if (sock) {
3073 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3074 		sock_update_classid(&sock->sk->sk_cgrp_data);
3075 	}
3076 }
3077 
3078 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3079 {
3080 	ssize_t res;
3081 	struct msghdr msg = {.msg_flags = flags};
3082 	struct kvec iov;
3083 	char *kaddr = kmap(page);
3084 	iov.iov_base = kaddr + offset;
3085 	iov.iov_len = size;
3086 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3087 	kunmap(page);
3088 	return res;
3089 }
3090 EXPORT_SYMBOL(sock_no_sendpage);
3091 
3092 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3093 				int offset, size_t size, int flags)
3094 {
3095 	ssize_t res;
3096 	struct msghdr msg = {.msg_flags = flags};
3097 	struct kvec iov;
3098 	char *kaddr = kmap(page);
3099 
3100 	iov.iov_base = kaddr + offset;
3101 	iov.iov_len = size;
3102 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3103 	kunmap(page);
3104 	return res;
3105 }
3106 EXPORT_SYMBOL(sock_no_sendpage_locked);
3107 
3108 /*
3109  *	Default Socket Callbacks
3110  */
3111 
3112 static void sock_def_wakeup(struct sock *sk)
3113 {
3114 	struct socket_wq *wq;
3115 
3116 	rcu_read_lock();
3117 	wq = rcu_dereference(sk->sk_wq);
3118 	if (skwq_has_sleeper(wq))
3119 		wake_up_interruptible_all(&wq->wait);
3120 	rcu_read_unlock();
3121 }
3122 
3123 static void sock_def_error_report(struct sock *sk)
3124 {
3125 	struct socket_wq *wq;
3126 
3127 	rcu_read_lock();
3128 	wq = rcu_dereference(sk->sk_wq);
3129 	if (skwq_has_sleeper(wq))
3130 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3131 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3132 	rcu_read_unlock();
3133 }
3134 
3135 void sock_def_readable(struct sock *sk)
3136 {
3137 	struct socket_wq *wq;
3138 
3139 	rcu_read_lock();
3140 	wq = rcu_dereference(sk->sk_wq);
3141 	if (skwq_has_sleeper(wq))
3142 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3143 						EPOLLRDNORM | EPOLLRDBAND);
3144 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3145 	rcu_read_unlock();
3146 }
3147 
3148 static void sock_def_write_space(struct sock *sk)
3149 {
3150 	struct socket_wq *wq;
3151 
3152 	rcu_read_lock();
3153 
3154 	/* Do not wake up a writer until he can make "significant"
3155 	 * progress.  --DaveM
3156 	 */
3157 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3158 		wq = rcu_dereference(sk->sk_wq);
3159 		if (skwq_has_sleeper(wq))
3160 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3161 						EPOLLWRNORM | EPOLLWRBAND);
3162 
3163 		/* Should agree with poll, otherwise some programs break */
3164 		if (sock_writeable(sk))
3165 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3166 	}
3167 
3168 	rcu_read_unlock();
3169 }
3170 
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3174 
3175 void sk_send_sigurg(struct sock *sk)
3176 {
3177 	if (sk->sk_socket && sk->sk_socket->file)
3178 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3179 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3180 }
3181 EXPORT_SYMBOL(sk_send_sigurg);
3182 
/* (Re)arm @timer to fire at @expires. A reference on @sk is taken when
 * mod_timer() returns 0 (presumably: the timer was not already
 * pending, so it held no reference yet).
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (!ret)
		sock_hold(sk);
}
3189 EXPORT_SYMBOL(sk_reset_timer);
3190 
/* Cancel @timer with del_timer(). When that returns non-zero, the
 * socket reference associated with the timer is dropped via
 * __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int ret = del_timer(timer);

	if (ret)
		__sock_put(sk);
}
3196 EXPORT_SYMBOL(sk_stop_timer);
3197 
/* Same as sk_stop_timer(), but cancels through del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	int ret = del_timer_sync(timer);

	if (ret)
		__sock_put(sk);
}
3203 EXPORT_SYMBOL(sk_stop_timer_sync);
3204 
3205 void sock_init_data(struct socket *sock, struct sock *sk)
3206 {
3207 	sk_init_common(sk);
3208 	sk->sk_send_head	=	NULL;
3209 
3210 	timer_setup(&sk->sk_timer, NULL, 0);
3211 
3212 	sk->sk_allocation	=	GFP_KERNEL;
3213 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3214 	sk->sk_sndbuf		=	sysctl_wmem_default;
3215 	sk->sk_state		=	TCP_CLOSE;
3216 	sk_set_socket(sk, sock);
3217 
3218 	sock_set_flag(sk, SOCK_ZAPPED);
3219 
3220 	if (sock) {
3221 		sk->sk_type	=	sock->type;
3222 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3223 		sock->sk	=	sk;
3224 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3225 	} else {
3226 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3227 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3228 	}
3229 
3230 	rwlock_init(&sk->sk_callback_lock);
3231 	if (sk->sk_kern_sock)
3232 		lockdep_set_class_and_name(
3233 			&sk->sk_callback_lock,
3234 			af_kern_callback_keys + sk->sk_family,
3235 			af_family_kern_clock_key_strings[sk->sk_family]);
3236 	else
3237 		lockdep_set_class_and_name(
3238 			&sk->sk_callback_lock,
3239 			af_callback_keys + sk->sk_family,
3240 			af_family_clock_key_strings[sk->sk_family]);
3241 
3242 	sk->sk_state_change	=	sock_def_wakeup;
3243 	sk->sk_data_ready	=	sock_def_readable;
3244 	sk->sk_write_space	=	sock_def_write_space;
3245 	sk->sk_error_report	=	sock_def_error_report;
3246 	sk->sk_destruct		=	sock_def_destruct;
3247 
3248 	sk->sk_frag.page	=	NULL;
3249 	sk->sk_frag.offset	=	0;
3250 	sk->sk_peek_off		=	-1;
3251 
3252 	sk->sk_peer_pid 	=	NULL;
3253 	sk->sk_peer_cred	=	NULL;
3254 	spin_lock_init(&sk->sk_peer_lock);
3255 
3256 	sk->sk_write_pending	=	0;
3257 	sk->sk_rcvlowat		=	1;
3258 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3259 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3260 
3261 	sk->sk_stamp = SK_DEFAULT_STAMP;
3262 #if BITS_PER_LONG==32
3263 	seqlock_init(&sk->sk_stamp_seq);
3264 #endif
3265 	atomic_set(&sk->sk_zckey, 0);
3266 
3267 #ifdef CONFIG_NET_RX_BUSY_POLL
3268 	sk->sk_napi_id		=	0;
3269 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3270 #endif
3271 
3272 	sk->sk_max_pacing_rate = ~0UL;
3273 	sk->sk_pacing_rate = ~0UL;
3274 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3275 	sk->sk_incoming_cpu = -1;
3276 
3277 	sk_rx_queue_clear(sk);
3278 	/*
3279 	 * Before updating sk_refcnt, we must commit prior changes to memory
3280 	 * (Documentation/RCU/rculist_nulls.rst for details)
3281 	 */
3282 	smp_wmb();
3283 	refcount_set(&sk->sk_refcnt, 1);
3284 	atomic_set(&sk->sk_drops, 0);
3285 }
3286 EXPORT_SYMBOL(sock_init_data);
3287 
3288 void lock_sock_nested(struct sock *sk, int subclass)
3289 {
3290 	/* The sk_lock has mutex_lock() semantics here. */
3291 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3292 
3293 	might_sleep();
3294 	spin_lock_bh(&sk->sk_lock.slock);
3295 	if (sock_owned_by_user_nocheck(sk))
3296 		__lock_sock(sk);
3297 	sk->sk_lock.owned = 1;
3298 	spin_unlock_bh(&sk->sk_lock.slock);
3299 }
3300 EXPORT_SYMBOL(lock_sock_nested);
3301 
3302 void release_sock(struct sock *sk)
3303 {
3304 	spin_lock_bh(&sk->sk_lock.slock);
3305 	if (sk->sk_backlog.tail)
3306 		__release_sock(sk);
3307 
3308 	/* Warning : release_cb() might need to release sk ownership,
3309 	 * ie call sock_release_ownership(sk) before us.
3310 	 */
3311 	if (sk->sk_prot->release_cb)
3312 		sk->sk_prot->release_cb(sk);
3313 
3314 	sock_release_ownership(sk);
3315 	if (waitqueue_active(&sk->sk_lock.wq))
3316 		wake_up(&sk->sk_lock.wq);
3317 	spin_unlock_bh(&sk->sk_lock.slock);
3318 }
3319 EXPORT_SYMBOL(release_sock);
3320 
3321 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3322 {
3323 	might_sleep();
3324 	spin_lock_bh(&sk->sk_lock.slock);
3325 
3326 	if (!sock_owned_by_user_nocheck(sk)) {
3327 		/*
3328 		 * Fast path return with bottom halves disabled and
3329 		 * sock::sk_lock.slock held.
3330 		 *
3331 		 * The 'mutex' is not contended and holding
3332 		 * sock::sk_lock.slock prevents all other lockers to
3333 		 * proceed so the corresponding unlock_sock_fast() can
3334 		 * avoid the slow path of release_sock() completely and
3335 		 * just release slock.
3336 		 *
3337 		 * From a semantical POV this is equivalent to 'acquiring'
3338 		 * the 'mutex', hence the corresponding lockdep
3339 		 * mutex_release() has to happen in the fast path of
3340 		 * unlock_sock_fast().
3341 		 */
3342 		return false;
3343 	}
3344 
3345 	__lock_sock(sk);
3346 	sk->sk_lock.owned = 1;
3347 	__acquire(&sk->sk_lock.slock);
3348 	spin_unlock_bh(&sk->sk_lock.slock);
3349 	return true;
3350 }
3351 EXPORT_SYMBOL(__lock_sock_fast);
3352 
3353 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3354 		   bool timeval, bool time32)
3355 {
3356 	struct sock *sk = sock->sk;
3357 	struct timespec64 ts;
3358 
3359 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3360 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3361 	if (ts.tv_sec == -1)
3362 		return -ENOENT;
3363 	if (ts.tv_sec == 0) {
3364 		ktime_t kt = ktime_get_real();
3365 		sock_write_timestamp(sk, kt);
3366 		ts = ktime_to_timespec64(kt);
3367 	}
3368 
3369 	if (timeval)
3370 		ts.tv_nsec /= 1000;
3371 
3372 #ifdef CONFIG_COMPAT_32BIT_TIME
3373 	if (time32)
3374 		return put_old_timespec32(&ts, userstamp);
3375 #endif
3376 #ifdef CONFIG_SPARC64
3377 	/* beware of padding in sparc64 timeval */
3378 	if (timeval && !in_compat_syscall()) {
3379 		struct __kernel_old_timeval __user tv = {
3380 			.tv_sec = ts.tv_sec,
3381 			.tv_usec = ts.tv_nsec,
3382 		};
3383 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3384 			return -EFAULT;
3385 		return 0;
3386 	}
3387 #endif
3388 	return put_timespec64(&ts, userstamp);
3389 }
3390 EXPORT_SYMBOL(sock_gettstamp);
3391 
3392 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3393 {
3394 	if (!sock_flag(sk, flag)) {
3395 		unsigned long previous_flags = sk->sk_flags;
3396 
3397 		sock_set_flag(sk, flag);
3398 		/*
3399 		 * we just set one of the two flags which require net
3400 		 * time stamping, but time stamping might have been on
3401 		 * already because of the other one
3402 		 */
3403 		if (sock_needs_netstamp(sk) &&
3404 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3405 			net_enable_timestamp();
3406 	}
3407 }
3408 
3409 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3410 		       int level, int type)
3411 {
3412 	struct sock_exterr_skb *serr;
3413 	struct sk_buff *skb;
3414 	int copied, err;
3415 
3416 	err = -EAGAIN;
3417 	skb = sock_dequeue_err_skb(sk);
3418 	if (skb == NULL)
3419 		goto out;
3420 
3421 	copied = skb->len;
3422 	if (copied > len) {
3423 		msg->msg_flags |= MSG_TRUNC;
3424 		copied = len;
3425 	}
3426 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3427 	if (err)
3428 		goto out_free_skb;
3429 
3430 	sock_recv_timestamp(msg, sk, skb);
3431 
3432 	serr = SKB_EXT_ERR(skb);
3433 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3434 
3435 	msg->msg_flags |= MSG_ERRQUEUE;
3436 	err = copied;
3437 
3438 out_free_skb:
3439 	kfree_skb(skb);
3440 out:
3441 	return err;
3442 }
3443 EXPORT_SYMBOL(sock_recv_errqueue);
3444 
3445 /*
3446  *	Get a socket option on an socket.
3447  *
3448  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3449  *	asynchronous errors should be reported by getsockopt. We assume
3450  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3451  */
3452 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3453 			   char __user *optval, int __user *optlen)
3454 {
3455 	struct sock *sk = sock->sk;
3456 
3457 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3458 }
3459 EXPORT_SYMBOL(sock_common_getsockopt);
3460 
3461 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3462 			int flags)
3463 {
3464 	struct sock *sk = sock->sk;
3465 	int addr_len = 0;
3466 	int err;
3467 
3468 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3469 				   flags & ~MSG_DONTWAIT, &addr_len);
3470 	if (err >= 0)
3471 		msg->msg_namelen = addr_len;
3472 	return err;
3473 }
3474 EXPORT_SYMBOL(sock_common_recvmsg);
3475 
3476 /*
3477  *	Set socket options on an inet socket.
3478  */
3479 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3480 			   sockptr_t optval, unsigned int optlen)
3481 {
3482 	struct sock *sk = sock->sk;
3483 
3484 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3485 }
3486 EXPORT_SYMBOL(sock_common_setsockopt);
3487 
3488 void sk_common_release(struct sock *sk)
3489 {
3490 	if (sk->sk_prot->destroy)
3491 		sk->sk_prot->destroy(sk);
3492 
3493 	/*
3494 	 * Observation: when sk_common_release is called, processes have
3495 	 * no access to socket. But net still has.
3496 	 * Step one, detach it from networking:
3497 	 *
3498 	 * A. Remove from hash tables.
3499 	 */
3500 
3501 	sk->sk_prot->unhash(sk);
3502 
3503 	/*
3504 	 * In this point socket cannot receive new packets, but it is possible
3505 	 * that some packets are in flight because some CPU runs receiver and
3506 	 * did hash table lookup before we unhashed socket. They will achieve
3507 	 * receive queue and will be purged by socket destructor.
3508 	 *
3509 	 * Also we still have packets pending on receive queue and probably,
3510 	 * our own packets waiting in device queues. sock_destroy will drain
3511 	 * receive queue, but transmitted packets will delay socket destruction
3512 	 * until the last reference will be released.
3513 	 */
3514 
3515 	sock_orphan(sk);
3516 
3517 	xfrm_sk_free_policy(sk);
3518 
3519 	sk_refcnt_debug_release(sk);
3520 
3521 	sock_put(sk);
3522 }
3523 EXPORT_SYMBOL(sk_common_release);
3524 
3525 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3526 {
3527 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3528 
3529 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3530 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3531 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3532 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3533 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3534 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3535 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3536 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3537 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3538 }
3539 
3540 #ifdef CONFIG_PROC_FS
3541 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3542 
3543 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3544 {
3545 	int cpu, idx = prot->inuse_idx;
3546 	int res = 0;
3547 
3548 	for_each_possible_cpu(cpu)
3549 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3550 
3551 	return res >= 0 ? res : 0;
3552 }
3553 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3554 
3555 int sock_inuse_get(struct net *net)
3556 {
3557 	int cpu, res = 0;
3558 
3559 	for_each_possible_cpu(cpu)
3560 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3561 
3562 	return res;
3563 }
3564 
3565 EXPORT_SYMBOL_GPL(sock_inuse_get);
3566 
3567 static int __net_init sock_inuse_init_net(struct net *net)
3568 {
3569 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3570 	if (net->core.prot_inuse == NULL)
3571 		return -ENOMEM;
3572 	return 0;
3573 }
3574 
3575 static void __net_exit sock_inuse_exit_net(struct net *net)
3576 {
3577 	free_percpu(net->core.prot_inuse);
3578 }
3579 
3580 static struct pernet_operations net_inuse_ops = {
3581 	.init = sock_inuse_init_net,
3582 	.exit = sock_inuse_exit_net,
3583 };
3584 
3585 static __init int net_inuse_init(void)
3586 {
3587 	if (register_pernet_subsys(&net_inuse_ops))
3588 		panic("Cannot initialize net inuse counters");
3589 
3590 	return 0;
3591 }
3592 
3593 core_initcall(net_inuse_init);
3594 
3595 static int assign_proto_idx(struct proto *prot)
3596 {
3597 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3598 
3599 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3600 		pr_err("PROTO_INUSE_NR exhausted\n");
3601 		return -ENOSPC;
3602 	}
3603 
3604 	set_bit(prot->inuse_idx, proto_inuse_idx);
3605 	return 0;
3606 }
3607 
3608 static void release_proto_idx(struct proto *prot)
3609 {
3610 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3611 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3612 }
3613 #else
/* Without CONFIG_PROC_FS no index is tracked: always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

/* Without CONFIG_PROC_FS there is nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3622 
3623 #endif
3624 
3625 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3626 {
3627 	if (!twsk_prot)
3628 		return;
3629 	kfree(twsk_prot->twsk_slab_name);
3630 	twsk_prot->twsk_slab_name = NULL;
3631 	kmem_cache_destroy(twsk_prot->twsk_slab);
3632 	twsk_prot->twsk_slab = NULL;
3633 }
3634 
3635 static int tw_prot_init(const struct proto *prot)
3636 {
3637 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3638 
3639 	if (!twsk_prot)
3640 		return 0;
3641 
3642 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3643 					      prot->name);
3644 	if (!twsk_prot->twsk_slab_name)
3645 		return -ENOMEM;
3646 
3647 	twsk_prot->twsk_slab =
3648 		kmem_cache_create(twsk_prot->twsk_slab_name,
3649 				  twsk_prot->twsk_obj_size, 0,
3650 				  SLAB_ACCOUNT | prot->slab_flags,
3651 				  NULL);
3652 	if (!twsk_prot->twsk_slab) {
3653 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3654 			prot->name);
3655 		return -ENOMEM;
3656 	}
3657 
3658 	return 0;
3659 }
3660 
3661 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3662 {
3663 	if (!rsk_prot)
3664 		return;
3665 	kfree(rsk_prot->slab_name);
3666 	rsk_prot->slab_name = NULL;
3667 	kmem_cache_destroy(rsk_prot->slab);
3668 	rsk_prot->slab = NULL;
3669 }
3670 
3671 static int req_prot_init(const struct proto *prot)
3672 {
3673 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3674 
3675 	if (!rsk_prot)
3676 		return 0;
3677 
3678 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3679 					prot->name);
3680 	if (!rsk_prot->slab_name)
3681 		return -ENOMEM;
3682 
3683 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3684 					   rsk_prot->obj_size, 0,
3685 					   SLAB_ACCOUNT | prot->slab_flags,
3686 					   NULL);
3687 
3688 	if (!rsk_prot->slab) {
3689 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3690 			prot->name);
3691 		return -ENOMEM;
3692 	}
3693 	return 0;
3694 }
3695 
3696 int proto_register(struct proto *prot, int alloc_slab)
3697 {
3698 	int ret = -ENOBUFS;
3699 
3700 	if (alloc_slab) {
3701 		prot->slab = kmem_cache_create_usercopy(prot->name,
3702 					prot->obj_size, 0,
3703 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3704 					prot->slab_flags,
3705 					prot->useroffset, prot->usersize,
3706 					NULL);
3707 
3708 		if (prot->slab == NULL) {
3709 			pr_crit("%s: Can't create sock SLAB cache!\n",
3710 				prot->name);
3711 			goto out;
3712 		}
3713 
3714 		if (req_prot_init(prot))
3715 			goto out_free_request_sock_slab;
3716 
3717 		if (tw_prot_init(prot))
3718 			goto out_free_timewait_sock_slab;
3719 	}
3720 
3721 	mutex_lock(&proto_list_mutex);
3722 	ret = assign_proto_idx(prot);
3723 	if (ret) {
3724 		mutex_unlock(&proto_list_mutex);
3725 		goto out_free_timewait_sock_slab;
3726 	}
3727 	list_add(&prot->node, &proto_list);
3728 	mutex_unlock(&proto_list_mutex);
3729 	return ret;
3730 
3731 out_free_timewait_sock_slab:
3732 	if (alloc_slab)
3733 		tw_prot_cleanup(prot->twsk_prot);
3734 out_free_request_sock_slab:
3735 	if (alloc_slab) {
3736 		req_prot_cleanup(prot->rsk_prot);
3737 
3738 		kmem_cache_destroy(prot->slab);
3739 		prot->slab = NULL;
3740 	}
3741 out:
3742 	return ret;
3743 }
3744 EXPORT_SYMBOL(proto_register);
3745 
3746 void proto_unregister(struct proto *prot)
3747 {
3748 	mutex_lock(&proto_list_mutex);
3749 	release_proto_idx(prot);
3750 	list_del(&prot->node);
3751 	mutex_unlock(&proto_list_mutex);
3752 
3753 	kmem_cache_destroy(prot->slab);
3754 	prot->slab = NULL;
3755 
3756 	req_prot_cleanup(prot->rsk_prot);
3757 	tw_prot_cleanup(prot->twsk_prot);
3758 }
3759 EXPORT_SYMBOL(proto_unregister);
3760 
3761 int sock_load_diag_module(int family, int protocol)
3762 {
3763 	if (!protocol) {
3764 		if (!sock_is_registered(family))
3765 			return -ENOENT;
3766 
3767 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3768 				      NETLINK_SOCK_DIAG, family);
3769 	}
3770 
3771 #ifdef CONFIG_INET
3772 	if (family == AF_INET &&
3773 	    protocol != IPPROTO_RAW &&
3774 	    protocol < MAX_INET_PROTOS &&
3775 	    !rcu_access_pointer(inet_protos[protocol]))
3776 		return -ENOENT;
3777 #endif
3778 
3779 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3780 			      NETLINK_SOCK_DIAG, family, protocol);
3781 }
3782 EXPORT_SYMBOL(sock_load_diag_module);
3783 
3784 #ifdef CONFIG_PROC_FS
3785 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3786 	__acquires(proto_list_mutex)
3787 {
3788 	mutex_lock(&proto_list_mutex);
3789 	return seq_list_start_head(&proto_list, *pos);
3790 }
3791 
3792 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3793 {
3794 	return seq_list_next(v, &proto_list, pos);
3795 }
3796 
3797 static void proto_seq_stop(struct seq_file *seq, void *v)
3798 	__releases(proto_list_mutex)
3799 {
3800 	mutex_unlock(&proto_list_mutex);
3801 }
3802 
/* Map a method pointer to 'y' (set) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';

	return 'n';
}
3807 static long sock_prot_memory_allocated(struct proto *proto)
3808 {
3809 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3810 }
3811 
3812 static const char *sock_prot_memory_pressure(struct proto *proto)
3813 {
3814 	return proto->memory_pressure != NULL ?
3815 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3816 }
3817 
3818 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3819 {
3820 
3821 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3822 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3823 		   proto->name,
3824 		   proto->obj_size,
3825 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3826 		   sock_prot_memory_allocated(proto),
3827 		   sock_prot_memory_pressure(proto),
3828 		   proto->max_header,
3829 		   proto->slab == NULL ? "no" : "yes",
3830 		   module_name(proto->owner),
3831 		   proto_method_implemented(proto->close),
3832 		   proto_method_implemented(proto->connect),
3833 		   proto_method_implemented(proto->disconnect),
3834 		   proto_method_implemented(proto->accept),
3835 		   proto_method_implemented(proto->ioctl),
3836 		   proto_method_implemented(proto->init),
3837 		   proto_method_implemented(proto->destroy),
3838 		   proto_method_implemented(proto->shutdown),
3839 		   proto_method_implemented(proto->setsockopt),
3840 		   proto_method_implemented(proto->getsockopt),
3841 		   proto_method_implemented(proto->sendmsg),
3842 		   proto_method_implemented(proto->recvmsg),
3843 		   proto_method_implemented(proto->sendpage),
3844 		   proto_method_implemented(proto->bind),
3845 		   proto_method_implemented(proto->backlog_rcv),
3846 		   proto_method_implemented(proto->hash),
3847 		   proto_method_implemented(proto->unhash),
3848 		   proto_method_implemented(proto->get_port),
3849 		   proto_method_implemented(proto->enter_memory_pressure));
3850 }
3851 
3852 static int proto_seq_show(struct seq_file *seq, void *v)
3853 {
3854 	if (v == &proto_list)
3855 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3856 			   "protocol",
3857 			   "size",
3858 			   "sockets",
3859 			   "memory",
3860 			   "press",
3861 			   "maxhdr",
3862 			   "slab",
3863 			   "module",
3864 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3865 	else
3866 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3867 	return 0;
3868 }
3869 
3870 static const struct seq_operations proto_seq_ops = {
3871 	.start  = proto_seq_start,
3872 	.next   = proto_seq_next,
3873 	.stop   = proto_seq_stop,
3874 	.show   = proto_seq_show,
3875 };
3876 
3877 static __net_init int proto_init_net(struct net *net)
3878 {
3879 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3880 			sizeof(struct seq_net_private)))
3881 		return -ENOMEM;
3882 
3883 	return 0;
3884 }
3885 
3886 static __net_exit void proto_exit_net(struct net *net)
3887 {
3888 	remove_proc_entry("protocols", net->proc_net);
3889 }
3890 
3891 
3892 static __net_initdata struct pernet_operations proto_net_ops = {
3893 	.init = proto_init_net,
3894 	.exit = proto_exit_net,
3895 };
3896 
3897 static int __init proto_init(void)
3898 {
3899 	return register_pernet_subsys(&proto_net_ops);
3900 }
3901 
3902 subsys_initcall(proto_init);
3903 
3904 #endif /* PROC_FS */
3905 
3906 #ifdef CONFIG_NET_RX_BUSY_POLL
3907 bool sk_busy_loop_end(void *p, unsigned long start_time)
3908 {
3909 	struct sock *sk = p;
3910 
3911 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3912 	       sk_busy_loop_timeout(sk, start_time);
3913 }
3914 EXPORT_SYMBOL(sk_busy_loop_end);
3915 #endif /* CONFIG_NET_RX_BUSY_POLL */
3916 
3917 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3918 {
3919 	if (!sk->sk_prot->bind_add)
3920 		return -EOPNOTSUPP;
3921 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3922 }
3923 EXPORT_SYMBOL(sock_bind_add);
3924