xref: /linux/net/core/sock.c (revision e3617433c3da3d0859a4bc67f3f975e87f650ebf)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
153  * Test to see if the opener of the socket had the capability @cap when
154  * the socket was created and that the current process has it in the
155  * user namespace @user_ns.
156  */
157 bool sk_ns_capable(const struct sock *sk,
158 		   struct user_namespace *user_ns, int cap)
159 {
160 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
161 		ns_capable(user_ns, cap);
162 }
163 EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
170  * Test to see if the opener of the socket had the capability @cap when
171  * the socket was created and that the current process has it in all
172  * user namespaces.
173  */
174 bool sk_capable(const struct sock *sk, int cap)
175 {
176 	return sk_ns_capable(sk, &init_user_ns, cap);
177 }
178 EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
185  * Test to see if the opener of the socket had the capability @cap when the
186  * socket was created and that the current process has it over the network
187  * namespace the socket is a member of.
188  */
189 bool sk_net_capable(const struct sock *sk, int cap)
190 {
191 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
192 }
193 EXPORT_SYMBOL(sk_net_capable);
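
/*
 * Editorial example (sketch, not part of the original file): a protocol's
 * privileged setsockopt path would typically gate on one of the helpers
 * above.  The handler name below is hypothetical.
 *
 *	static int hypothetical_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 */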
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family and separate keys for internal and
198  * userspace sockets.
199  */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (We pre-construct these
207  * strings at build time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 
211 #define _sock_locks(x)						  \
212   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
213   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
214   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
215   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
216   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
217   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
218   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
219   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
220   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
221   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
222   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
223   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
224   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
225   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
226   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
227   x "AF_MCTP"  , \
228   x "AF_MAX"
229 
230 static const char *const af_family_key_strings[AF_MAX+1] = {
231 	_sock_locks("sk_lock-")
232 };
233 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
234 	_sock_locks("slock-")
235 };
236 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
237 	_sock_locks("clock-")
238 };
239 
240 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
241 	_sock_locks("k-sk_lock-")
242 };
243 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
244 	_sock_locks("k-slock-")
245 };
246 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
247 	_sock_locks("k-clock-")
248 };
249 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
250 	_sock_locks("rlock-")
251 };
252 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
253 	_sock_locks("wlock-")
254 };
255 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
256 	_sock_locks("elock-")
257 };
258 
259 /*
260  * sk_callback_lock and sk queues locking rules are per-address-family,
261  * so split the lock classes by using a per-AF key:
262  */
263 static struct lock_class_key af_callback_keys[AF_MAX];
264 static struct lock_class_key af_rlock_keys[AF_MAX];
265 static struct lock_class_key af_wlock_keys[AF_MAX];
266 static struct lock_class_key af_elock_keys[AF_MAX];
267 static struct lock_class_key af_kern_callback_keys[AF_MAX];
268 
269 /* Run time adjustable parameters. */
270 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
271 EXPORT_SYMBOL(sysctl_wmem_max);
272 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
273 EXPORT_SYMBOL(sysctl_rmem_max);
274 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
275 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
276 
277 /* Maximal space eaten by iovec or ancillary data plus some space */
278 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
279 EXPORT_SYMBOL(sysctl_optmem_max);
280 
281 int sysctl_tstamp_allow_data __read_mostly = 1;
282 
283 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
284 EXPORT_SYMBOL_GPL(memalloc_socks_key);
285 
286 /**
287  * sk_set_memalloc - sets %SOCK_MEMALLOC
288  * @sk: socket to set it on
289  *
290  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
291  * It's the responsibility of the admin to adjust min_free_kbytes
292  * to meet the requirements.
293  */
294 void sk_set_memalloc(struct sock *sk)
295 {
296 	sock_set_flag(sk, SOCK_MEMALLOC);
297 	sk->sk_allocation |= __GFP_MEMALLOC;
298 	static_branch_inc(&memalloc_socks_key);
299 }
300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
301 
302 void sk_clear_memalloc(struct sock *sk)
303 {
304 	sock_reset_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation &= ~__GFP_MEMALLOC;
306 	static_branch_dec(&memalloc_socks_key);
307 
308 	/*
309 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
310 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
311 	 * it has rmem allocations due to the last swapfile being deactivated
312 	 * but there is a risk that the socket is unusable due to exceeding
313 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
314 	 */
315 	sk_mem_reclaim(sk);
316 }
317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
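
/*
 * Editorial example (illustrative, not from the original file): swap-over-NFS
 * style users mark their transport socket before it is used to write out
 * pages under memory pressure, and unmark it when the last swapfile on that
 * transport goes away, e.g.:
 *
 *	sk_set_memalloc(xprt_sk);	(before swapon over the transport)
 *	sk_clear_memalloc(xprt_sk);	(after the last swapoff)
 *
 * "xprt_sk" is a hypothetical transport socket pointer.
 */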
318 
319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
320 {
321 	int ret;
322 	unsigned int noreclaim_flag;
323 
324 	/* these should have been dropped before queueing */
325 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
326 
327 	noreclaim_flag = memalloc_noreclaim_save();
328 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
329 				 tcp_v6_do_rcv,
330 				 tcp_v4_do_rcv,
331 				 sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 void sk_error_report(struct sock *sk)
339 {
340 	sk->sk_error_report(sk);
341 
342 	switch (sk->sk_family) {
343 	case AF_INET:
344 		fallthrough;
345 	case AF_INET6:
346 		trace_inet_sk_error_report(sk);
347 		break;
348 	default:
349 		break;
350 	}
351 }
352 EXPORT_SYMBOL(sk_error_report);
353 
354 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
355 {
356 	struct __kernel_sock_timeval tv;
357 
358 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
359 		tv.tv_sec = 0;
360 		tv.tv_usec = 0;
361 	} else {
362 		tv.tv_sec = timeo / HZ;
363 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
364 	}
365 
366 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
367 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
368 		*(struct old_timeval32 *)optval = tv32;
369 		return sizeof(tv32);
370 	}
371 
372 	if (old_timeval) {
373 		struct __kernel_old_timeval old_tv;
374 		old_tv.tv_sec = tv.tv_sec;
375 		old_tv.tv_usec = tv.tv_usec;
376 		*(struct __kernel_old_timeval *)optval = old_tv;
377 		return sizeof(old_tv);
378 	}
379 
380 	*(struct __kernel_sock_timeval *)optval = tv;
381 	return sizeof(tv);
382 }
383 EXPORT_SYMBOL(sock_get_timeout);
384 
385 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
386 			   sockptr_t optval, int optlen, bool old_timeval)
387 {
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv->tv_sec = tv32.tv_sec;
397 		tv->tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv->tv_sec = old_tv.tv_sec;
406 		tv->tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(*tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
411 			return -EFAULT;
412 	}
413 
414 	return 0;
415 }
416 EXPORT_SYMBOL(sock_copy_user_timeval);
417 
418 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
419 			    bool old_timeval)
420 {
421 	struct __kernel_sock_timeval tv;
422 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
423 
424 	if (err)
425 		return err;
426 
427 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
428 		return -EDOM;
429 
430 	if (tv.tv_sec < 0) {
431 		static int warned __read_mostly;
432 
433 		*timeo_p = 0;
434 		if (warned < 10 && net_ratelimit()) {
435 			warned++;
436 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
437 				__func__, current->comm, task_pid_nr(current));
438 		}
439 		return 0;
440 	}
441 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
442 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
443 		return 0;
444 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
445 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
446 	return 0;
447 }
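
/*
 * Editorial note: worked example of the conversion above, assuming HZ=250.
 * A user passes SO_RCVTIMEO with { .tv_sec = 2, .tv_usec = 500000 }:
 *
 *	timeo = 2 * HZ + DIV_ROUND_UP(500000, USEC_PER_SEC / HZ)
 *	      = 500 + DIV_ROUND_UP(500000, 4000)
 *	      = 625 jiffies (2.5 s)
 *
 * A zero timeval maps to MAX_SCHEDULE_TIMEOUT (wait forever), and a negative
 * tv_sec is clamped to 0 (non-blocking) with a rate-limited warning.
 */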
448 
449 static bool sock_needs_netstamp(const struct sock *sk)
450 {
451 	switch (sk->sk_family) {
452 	case AF_UNSPEC:
453 	case AF_UNIX:
454 		return false;
455 	default:
456 		return true;
457 	}
458 }
459 
460 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
461 {
462 	if (sk->sk_flags & flags) {
463 		sk->sk_flags &= ~flags;
464 		if (sock_needs_netstamp(sk) &&
465 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
466 			net_disable_timestamp();
467 	}
468 }
469 
470 
471 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
472 {
473 	unsigned long flags;
474 	struct sk_buff_head *list = &sk->sk_receive_queue;
475 
476 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
477 		atomic_inc(&sk->sk_drops);
478 		trace_sock_rcvqueue_full(sk, skb);
479 		return -ENOMEM;
480 	}
481 
482 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
483 		atomic_inc(&sk->sk_drops);
484 		return -ENOBUFS;
485 	}
486 
487 	skb->dev = NULL;
488 	skb_set_owner_r(skb, sk);
489 
490 	/* We escape from an RCU-protected region, so make sure we don't leak
491 	 * a non-refcounted dst.
492 	 */
493 	skb_dst_force(skb);
494 
495 	spin_lock_irqsave(&list->lock, flags);
496 	sock_skb_set_dropcount(sk, skb);
497 	__skb_queue_tail(list, skb);
498 	spin_unlock_irqrestore(&list->lock, flags);
499 
500 	if (!sock_flag(sk, SOCK_DEAD))
501 		sk->sk_data_ready(sk);
502 	return 0;
503 }
504 EXPORT_SYMBOL(__sock_queue_rcv_skb);
505 
506 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
507 {
508 	int err;
509 
510 	err = sk_filter(sk, skb);
511 	if (err)
512 		return err;
513 
514 	return __sock_queue_rcv_skb(sk, skb);
515 }
516 EXPORT_SYMBOL(sock_queue_rcv_skb);
517 
518 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
519 		     const int nested, unsigned int trim_cap, bool refcounted)
520 {
521 	int rc = NET_RX_SUCCESS;
522 
523 	if (sk_filter_trim_cap(sk, skb, trim_cap))
524 		goto discard_and_relse;
525 
526 	skb->dev = NULL;
527 
528 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
529 		atomic_inc(&sk->sk_drops);
530 		goto discard_and_relse;
531 	}
532 	if (nested)
533 		bh_lock_sock_nested(sk);
534 	else
535 		bh_lock_sock(sk);
536 	if (!sock_owned_by_user(sk)) {
537 		/*
538 		 * trylock + unlock semantics:
539 		 */
540 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
541 
542 		rc = sk_backlog_rcv(sk, skb);
543 
544 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
545 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
546 		bh_unlock_sock(sk);
547 		atomic_inc(&sk->sk_drops);
548 		goto discard_and_relse;
549 	}
550 
551 	bh_unlock_sock(sk);
552 out:
553 	if (refcounted)
554 		sock_put(sk);
555 	return rc;
556 discard_and_relse:
557 	kfree_skb(skb);
558 	goto out;
559 }
560 EXPORT_SYMBOL(__sk_receive_skb);
561 
562 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
563 							  u32));
564 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
565 							   u32));
566 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
567 {
568 	struct dst_entry *dst = __sk_dst_get(sk);
569 
570 	if (dst && dst->obsolete &&
571 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
572 			       dst, cookie) == NULL) {
573 		sk_tx_queue_clear(sk);
574 		sk->sk_dst_pending_confirm = 0;
575 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
576 		dst_release(dst);
577 		return NULL;
578 	}
579 
580 	return dst;
581 }
582 EXPORT_SYMBOL(__sk_dst_check);
583 
584 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
585 {
586 	struct dst_entry *dst = sk_dst_get(sk);
587 
588 	if (dst && dst->obsolete &&
589 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
590 			       dst, cookie) == NULL) {
591 		sk_dst_reset(sk);
592 		dst_release(dst);
593 		return NULL;
594 	}
595 
596 	return dst;
597 }
598 EXPORT_SYMBOL(sk_dst_check);
599 
600 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
601 {
602 	int ret = -ENOPROTOOPT;
603 #ifdef CONFIG_NETDEVICES
604 	struct net *net = sock_net(sk);
605 
606 	/* Sorry... */
607 	ret = -EPERM;
608 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
609 		goto out;
610 
611 	ret = -EINVAL;
612 	if (ifindex < 0)
613 		goto out;
614 
615 	sk->sk_bound_dev_if = ifindex;
616 	if (sk->sk_prot->rehash)
617 		sk->sk_prot->rehash(sk);
618 	sk_dst_reset(sk);
619 
620 	ret = 0;
621 
622 out:
623 #endif
624 
625 	return ret;
626 }
627 
628 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
629 {
630 	int ret;
631 
632 	if (lock_sk)
633 		lock_sock(sk);
634 	ret = sock_bindtoindex_locked(sk, ifindex);
635 	if (lock_sk)
636 		release_sock(sk);
637 
638 	return ret;
639 }
640 EXPORT_SYMBOL(sock_bindtoindex);
641 
642 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
643 {
644 	int ret = -ENOPROTOOPT;
645 #ifdef CONFIG_NETDEVICES
646 	struct net *net = sock_net(sk);
647 	char devname[IFNAMSIZ];
648 	int index;
649 
650 	ret = -EINVAL;
651 	if (optlen < 0)
652 		goto out;
653 
654 	/* Bind this socket to a particular device like "eth0",
655 	 * as specified in the passed interface name. If the
656 	 * name is "" or the option length is zero the socket
657 	 * is not bound.
658 	 */
659 	if (optlen > IFNAMSIZ - 1)
660 		optlen = IFNAMSIZ - 1;
661 	memset(devname, 0, sizeof(devname));
662 
663 	ret = -EFAULT;
664 	if (copy_from_sockptr(devname, optval, optlen))
665 		goto out;
666 
667 	index = 0;
668 	if (devname[0] != '\0') {
669 		struct net_device *dev;
670 
671 		rcu_read_lock();
672 		dev = dev_get_by_name_rcu(net, devname);
673 		if (dev)
674 			index = dev->ifindex;
675 		rcu_read_unlock();
676 		ret = -ENODEV;
677 		if (!dev)
678 			goto out;
679 	}
680 
681 	return sock_bindtoindex(sk, index, true);
682 out:
683 #endif
684 
685 	return ret;
686 }
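
/*
 * Editorial example (user-space view of the handler above; illustrative):
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *
 * passes "eth0" in via copy_from_sockptr(), resolves it to an ifindex under
 * RCU and then calls sock_bindtoindex().  Passing an empty string (or a zero
 * option length) unbinds the socket again.
 */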
687 
688 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
689 				int __user *optlen, int len)
690 {
691 	int ret = -ENOPROTOOPT;
692 #ifdef CONFIG_NETDEVICES
693 	struct net *net = sock_net(sk);
694 	char devname[IFNAMSIZ];
695 
696 	if (sk->sk_bound_dev_if == 0) {
697 		len = 0;
698 		goto zero;
699 	}
700 
701 	ret = -EINVAL;
702 	if (len < IFNAMSIZ)
703 		goto out;
704 
705 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
706 	if (ret)
707 		goto out;
708 
709 	len = strlen(devname) + 1;
710 
711 	ret = -EFAULT;
712 	if (copy_to_user(optval, devname, len))
713 		goto out;
714 
715 zero:
716 	ret = -EFAULT;
717 	if (put_user(len, optlen))
718 		goto out;
719 
720 	ret = 0;
721 
722 out:
723 #endif
724 
725 	return ret;
726 }
727 
728 bool sk_mc_loop(struct sock *sk)
729 {
730 	if (dev_recursion_level())
731 		return false;
732 	if (!sk)
733 		return true;
734 	switch (sk->sk_family) {
735 	case AF_INET:
736 		return inet_sk(sk)->mc_loop;
737 #if IS_ENABLED(CONFIG_IPV6)
738 	case AF_INET6:
739 		return inet6_sk(sk)->mc_loop;
740 #endif
741 	}
742 	WARN_ON_ONCE(1);
743 	return true;
744 }
745 EXPORT_SYMBOL(sk_mc_loop);
746 
747 void sock_set_reuseaddr(struct sock *sk)
748 {
749 	lock_sock(sk);
750 	sk->sk_reuse = SK_CAN_REUSE;
751 	release_sock(sk);
752 }
753 EXPORT_SYMBOL(sock_set_reuseaddr);
754 
755 void sock_set_reuseport(struct sock *sk)
756 {
757 	lock_sock(sk);
758 	sk->sk_reuseport = true;
759 	release_sock(sk);
760 }
761 EXPORT_SYMBOL(sock_set_reuseport);
762 
763 void sock_no_linger(struct sock *sk)
764 {
765 	lock_sock(sk);
766 	sk->sk_lingertime = 0;
767 	sock_set_flag(sk, SOCK_LINGER);
768 	release_sock(sk);
769 }
770 EXPORT_SYMBOL(sock_no_linger);
771 
772 void sock_set_priority(struct sock *sk, u32 priority)
773 {
774 	lock_sock(sk);
775 	sk->sk_priority = priority;
776 	release_sock(sk);
777 }
778 EXPORT_SYMBOL(sock_set_priority);
779 
780 void sock_set_sndtimeo(struct sock *sk, s64 secs)
781 {
782 	lock_sock(sk);
783 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
784 		sk->sk_sndtimeo = secs * HZ;
785 	else
786 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
787 	release_sock(sk);
788 }
789 EXPORT_SYMBOL(sock_set_sndtimeo);
790 
791 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
792 {
793 	if (val)  {
794 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
795 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
796 		sock_set_flag(sk, SOCK_RCVTSTAMP);
797 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
798 	} else {
799 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
800 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
801 	}
802 }
803 
804 void sock_enable_timestamps(struct sock *sk)
805 {
806 	lock_sock(sk);
807 	__sock_set_timestamps(sk, true, false, true);
808 	release_sock(sk);
809 }
810 EXPORT_SYMBOL(sock_enable_timestamps);
811 
812 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
813 {
814 	switch (optname) {
815 	case SO_TIMESTAMP_OLD:
816 		__sock_set_timestamps(sk, valbool, false, false);
817 		break;
818 	case SO_TIMESTAMP_NEW:
819 		__sock_set_timestamps(sk, valbool, true, false);
820 		break;
821 	case SO_TIMESTAMPNS_OLD:
822 		__sock_set_timestamps(sk, valbool, false, true);
823 		break;
824 	case SO_TIMESTAMPNS_NEW:
825 		__sock_set_timestamps(sk, valbool, true, true);
826 		break;
827 	}
828 }
829 
830 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
831 {
832 	struct net *net = sock_net(sk);
833 	struct net_device *dev = NULL;
834 	bool match = false;
835 	int *vclock_index;
836 	int i, num;
837 
838 	if (sk->sk_bound_dev_if)
839 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
840 
841 	if (!dev) {
842 		pr_err("%s: socket not bound to a device\n", __func__);
843 		return -EOPNOTSUPP;
844 	}
845 
846 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
847 	for (i = 0; i < num; i++) {
848 		if (*(vclock_index + i) == phc_index) {
849 			match = true;
850 			break;
851 		}
852 	}
853 
854 	if (num > 0)
855 		kfree(vclock_index);
856 
857 	if (!match)
858 		return -EINVAL;
859 
860 	sk->sk_bind_phc = phc_index;
861 
862 	return 0;
863 }
864 
865 int sock_set_timestamping(struct sock *sk, int optname,
866 			  struct so_timestamping timestamping)
867 {
868 	int val = timestamping.flags;
869 	int ret;
870 
871 	if (val & ~SOF_TIMESTAMPING_MASK)
872 		return -EINVAL;
873 
874 	if (val & SOF_TIMESTAMPING_OPT_ID &&
875 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
876 		if (sk_is_tcp(sk)) {
877 			if ((1 << sk->sk_state) &
878 			    (TCPF_CLOSE | TCPF_LISTEN))
879 				return -EINVAL;
880 			sk->sk_tskey = tcp_sk(sk)->snd_una;
881 		} else {
882 			sk->sk_tskey = 0;
883 		}
884 	}
885 
886 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
887 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
888 		return -EINVAL;
889 
890 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
891 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
892 		if (ret)
893 			return ret;
894 	}
895 
896 	sk->sk_tsflags = val;
897 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
898 
899 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
900 		sock_enable_timestamp(sk,
901 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
902 	else
903 		sock_disable_timestamp(sk,
904 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
905 	return 0;
906 }
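
/*
 * Editorial example (user-space view; illustrative): SO_TIMESTAMPING with a
 * PHC vclock bind uses the extended struct so_timestamping form.  The vclock
 * index 2 below is hypothetical.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_HARDWARE |
 *			 SOF_TIMESTAMPING_RAW_HARDWARE |
 *			 SOF_TIMESTAMPING_BIND_PHC,
 *		.bind_phc = 2,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * The socket must already be bound to a device (SO_BINDTODEVICE or
 * SO_BINDTOIFINDEX) whose PHC provides that vclock index, otherwise
 * sock_timestamping_bind_phc() above rejects the request.
 */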
907 
908 void sock_set_keepalive(struct sock *sk)
909 {
910 	lock_sock(sk);
911 	if (sk->sk_prot->keepalive)
912 		sk->sk_prot->keepalive(sk, true);
913 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
914 	release_sock(sk);
915 }
916 EXPORT_SYMBOL(sock_set_keepalive);
917 
918 static void __sock_set_rcvbuf(struct sock *sk, int val)
919 {
920 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
921 	 * as a negative value.
922 	 */
923 	val = min_t(int, val, INT_MAX / 2);
924 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
925 
926 	/* We double it on the way in to account for "struct sk_buff" etc.
927 	 * overhead. Applications assume that the SO_RCVBUF setting they make
928 	 * will allow that much actual data to be received on that socket.
929 	 *
930 	 * Applications are unaware that "struct sk_buff" and other overheads
931 	 * allocate from the receive buffer during socket buffer allocation.
932 	 *
933 	 * And after considering the possible alternatives, returning the value
934 	 * we actually used in getsockopt is the most desirable behavior.
935 	 */
936 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
937 }
938 
939 void sock_set_rcvbuf(struct sock *sk, int val)
940 {
941 	lock_sock(sk);
942 	__sock_set_rcvbuf(sk, val);
943 	release_sock(sk);
944 }
945 EXPORT_SYMBOL(sock_set_rcvbuf);
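
/*
 * Editorial example (user-space view; illustrative): because of the doubling
 * above, a request for 64 kB reads back as roughly 128 kB:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *
 * val is now 131072, assuming the request was not clamped by
 * net.core.rmem_max first.
 */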
946 
947 static void __sock_set_mark(struct sock *sk, u32 val)
948 {
949 	if (val != sk->sk_mark) {
950 		sk->sk_mark = val;
951 		sk_dst_reset(sk);
952 	}
953 }
954 
955 void sock_set_mark(struct sock *sk, u32 val)
956 {
957 	lock_sock(sk);
958 	__sock_set_mark(sk, val);
959 	release_sock(sk);
960 }
961 EXPORT_SYMBOL(sock_set_mark);
962 
963 static void sock_release_reserved_memory(struct sock *sk, int bytes)
964 {
965 	/* Round down bytes to a multiple of pages */
966 	bytes &= ~(SK_MEM_QUANTUM - 1);
967 
968 	WARN_ON(bytes > sk->sk_reserved_mem);
969 	sk->sk_reserved_mem -= bytes;
970 	sk_mem_reclaim(sk);
971 }
972 
973 static int sock_reserve_memory(struct sock *sk, int bytes)
974 {
975 	long allocated;
976 	bool charged;
977 	int pages;
978 
979 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
980 		return -EOPNOTSUPP;
981 
982 	if (!bytes)
983 		return 0;
984 
985 	pages = sk_mem_pages(bytes);
986 
987 	/* pre-charge to memcg */
988 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
989 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
990 	if (!charged)
991 		return -ENOMEM;
992 
993 	/* pre-charge to forward_alloc */
994 	allocated = sk_memory_allocated_add(sk, pages);
995 	/* If the system goes into memory pressure with this
996 	 * precharge, give up and return an error.
997 	 */
998 	if (allocated > sk_prot_mem_limits(sk, 1)) {
999 		sk_memory_allocated_sub(sk, pages);
1000 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1001 		return -ENOMEM;
1002 	}
1003 	sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1004 
1005 	sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1006 
1007 	return 0;
1008 }
1009 
1010 /*
1011  *	This is meant for all protocols to use and covers goings on
1012  *	at the socket level. Everything here is generic.
1013  */
1014 
1015 int sock_setsockopt(struct socket *sock, int level, int optname,
1016 		    sockptr_t optval, unsigned int optlen)
1017 {
1018 	struct so_timestamping timestamping;
1019 	struct sock_txtime sk_txtime;
1020 	struct sock *sk = sock->sk;
1021 	int val;
1022 	int valbool;
1023 	struct linger ling;
1024 	int ret = 0;
1025 
1026 	/*
1027 	 *	Options without arguments
1028 	 */
1029 
1030 	if (optname == SO_BINDTODEVICE)
1031 		return sock_setbindtodevice(sk, optval, optlen);
1032 
1033 	if (optlen < sizeof(int))
1034 		return -EINVAL;
1035 
1036 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1037 		return -EFAULT;
1038 
1039 	valbool = val ? 1 : 0;
1040 
1041 	lock_sock(sk);
1042 
1043 	switch (optname) {
1044 	case SO_DEBUG:
1045 		if (val && !capable(CAP_NET_ADMIN))
1046 			ret = -EACCES;
1047 		else
1048 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1049 		break;
1050 	case SO_REUSEADDR:
1051 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1052 		break;
1053 	case SO_REUSEPORT:
1054 		sk->sk_reuseport = valbool;
1055 		break;
1056 	case SO_TYPE:
1057 	case SO_PROTOCOL:
1058 	case SO_DOMAIN:
1059 	case SO_ERROR:
1060 		ret = -ENOPROTOOPT;
1061 		break;
1062 	case SO_DONTROUTE:
1063 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1064 		sk_dst_reset(sk);
1065 		break;
1066 	case SO_BROADCAST:
1067 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1068 		break;
1069 	case SO_SNDBUF:
1070 		/* Don't error on this; BSD doesn't, and if you think
1071 		 * about it this is right. Otherwise apps have to
1072 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1073 		 * are treated in BSD as hints.
1074 		 */
1075 		val = min_t(u32, val, sysctl_wmem_max);
1076 set_sndbuf:
1077 		/* Ensure val * 2 fits into an int, to prevent max_t()
1078 		 * from treating it as a negative value.
1079 		 */
1080 		val = min_t(int, val, INT_MAX / 2);
1081 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1082 		WRITE_ONCE(sk->sk_sndbuf,
1083 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1084 		/* Wake up sending tasks if we upped the value. */
1085 		sk->sk_write_space(sk);
1086 		break;
1087 
1088 	case SO_SNDBUFFORCE:
1089 		if (!capable(CAP_NET_ADMIN)) {
1090 			ret = -EPERM;
1091 			break;
1092 		}
1093 
1094 		/* No negative values (to prevent underflow, as val will be
1095 		 * multiplied by 2).
1096 		 */
1097 		if (val < 0)
1098 			val = 0;
1099 		goto set_sndbuf;
1100 
1101 	case SO_RCVBUF:
1102 		/* Don't error on this; BSD doesn't, and if you think
1103 		 * about it this is right. Otherwise apps have to
1104 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1105 		 * are treated in BSD as hints.
1106 		 */
1107 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1108 		break;
1109 
1110 	case SO_RCVBUFFORCE:
1111 		if (!capable(CAP_NET_ADMIN)) {
1112 			ret = -EPERM;
1113 			break;
1114 		}
1115 
1116 		/* No negative values (to prevent underflow, as val will be
1117 		 * multiplied by 2).
1118 		 */
1119 		__sock_set_rcvbuf(sk, max(val, 0));
1120 		break;
1121 
1122 	case SO_KEEPALIVE:
1123 		if (sk->sk_prot->keepalive)
1124 			sk->sk_prot->keepalive(sk, valbool);
1125 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1126 		break;
1127 
1128 	case SO_OOBINLINE:
1129 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1130 		break;
1131 
1132 	case SO_NO_CHECK:
1133 		sk->sk_no_check_tx = valbool;
1134 		break;
1135 
1136 	case SO_PRIORITY:
1137 		if ((val >= 0 && val <= 6) ||
1138 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1139 			sk->sk_priority = val;
1140 		else
1141 			ret = -EPERM;
1142 		break;
1143 
1144 	case SO_LINGER:
1145 		if (optlen < sizeof(ling)) {
1146 			ret = -EINVAL;	/* 1003.1g */
1147 			break;
1148 		}
1149 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1150 			ret = -EFAULT;
1151 			break;
1152 		}
1153 		if (!ling.l_onoff)
1154 			sock_reset_flag(sk, SOCK_LINGER);
1155 		else {
1156 #if (BITS_PER_LONG == 32)
1157 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1158 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1159 			else
1160 #endif
1161 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1162 			sock_set_flag(sk, SOCK_LINGER);
1163 		}
1164 		break;
1165 
1166 	case SO_BSDCOMPAT:
1167 		break;
1168 
1169 	case SO_PASSCRED:
1170 		if (valbool)
1171 			set_bit(SOCK_PASSCRED, &sock->flags);
1172 		else
1173 			clear_bit(SOCK_PASSCRED, &sock->flags);
1174 		break;
1175 
1176 	case SO_TIMESTAMP_OLD:
1177 	case SO_TIMESTAMP_NEW:
1178 	case SO_TIMESTAMPNS_OLD:
1179 	case SO_TIMESTAMPNS_NEW:
1180 		sock_set_timestamp(sk, optname, valbool);
1181 		break;
1182 
1183 	case SO_TIMESTAMPING_NEW:
1184 	case SO_TIMESTAMPING_OLD:
1185 		if (optlen == sizeof(timestamping)) {
1186 			if (copy_from_sockptr(&timestamping, optval,
1187 					      sizeof(timestamping))) {
1188 				ret = -EFAULT;
1189 				break;
1190 			}
1191 		} else {
1192 			memset(&timestamping, 0, sizeof(timestamping));
1193 			timestamping.flags = val;
1194 		}
1195 		ret = sock_set_timestamping(sk, optname, timestamping);
1196 		break;
1197 
1198 	case SO_RCVLOWAT:
1199 		if (val < 0)
1200 			val = INT_MAX;
1201 		if (sock->ops->set_rcvlowat)
1202 			ret = sock->ops->set_rcvlowat(sk, val);
1203 		else
1204 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1205 		break;
1206 
1207 	case SO_RCVTIMEO_OLD:
1208 	case SO_RCVTIMEO_NEW:
1209 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1210 				       optlen, optname == SO_RCVTIMEO_OLD);
1211 		break;
1212 
1213 	case SO_SNDTIMEO_OLD:
1214 	case SO_SNDTIMEO_NEW:
1215 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1216 				       optlen, optname == SO_SNDTIMEO_OLD);
1217 		break;
1218 
1219 	case SO_ATTACH_FILTER: {
1220 		struct sock_fprog fprog;
1221 
1222 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1223 		if (!ret)
1224 			ret = sk_attach_filter(&fprog, sk);
1225 		break;
1226 	}
1227 	case SO_ATTACH_BPF:
1228 		ret = -EINVAL;
1229 		if (optlen == sizeof(u32)) {
1230 			u32 ufd;
1231 
1232 			ret = -EFAULT;
1233 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1234 				break;
1235 
1236 			ret = sk_attach_bpf(ufd, sk);
1237 		}
1238 		break;
1239 
1240 	case SO_ATTACH_REUSEPORT_CBPF: {
1241 		struct sock_fprog fprog;
1242 
1243 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1244 		if (!ret)
1245 			ret = sk_reuseport_attach_filter(&fprog, sk);
1246 		break;
1247 	}
1248 	case SO_ATTACH_REUSEPORT_EBPF:
1249 		ret = -EINVAL;
1250 		if (optlen == sizeof(u32)) {
1251 			u32 ufd;
1252 
1253 			ret = -EFAULT;
1254 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1255 				break;
1256 
1257 			ret = sk_reuseport_attach_bpf(ufd, sk);
1258 		}
1259 		break;
1260 
1261 	case SO_DETACH_REUSEPORT_BPF:
1262 		ret = reuseport_detach_prog(sk);
1263 		break;
1264 
1265 	case SO_DETACH_FILTER:
1266 		ret = sk_detach_filter(sk);
1267 		break;
1268 
1269 	case SO_LOCK_FILTER:
1270 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1271 			ret = -EPERM;
1272 		else
1273 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1274 		break;
1275 
1276 	case SO_PASSSEC:
1277 		if (valbool)
1278 			set_bit(SOCK_PASSSEC, &sock->flags);
1279 		else
1280 			clear_bit(SOCK_PASSSEC, &sock->flags);
1281 		break;
1282 	case SO_MARK:
1283 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1284 			ret = -EPERM;
1285 			break;
1286 		}
1287 
1288 		__sock_set_mark(sk, val);
1289 		break;
1290 
1291 	case SO_RXQ_OVFL:
1292 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1293 		break;
1294 
1295 	case SO_WIFI_STATUS:
1296 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1297 		break;
1298 
1299 	case SO_PEEK_OFF:
1300 		if (sock->ops->set_peek_off)
1301 			ret = sock->ops->set_peek_off(sk, val);
1302 		else
1303 			ret = -EOPNOTSUPP;
1304 		break;
1305 
1306 	case SO_NOFCS:
1307 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1308 		break;
1309 
1310 	case SO_SELECT_ERR_QUEUE:
1311 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1312 		break;
1313 
1314 #ifdef CONFIG_NET_RX_BUSY_POLL
1315 	case SO_BUSY_POLL:
1316 		/* allow unprivileged users to decrease the value */
1317 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1318 			ret = -EPERM;
1319 		else {
1320 			if (val < 0)
1321 				ret = -EINVAL;
1322 			else
1323 				WRITE_ONCE(sk->sk_ll_usec, val);
1324 		}
1325 		break;
1326 	case SO_PREFER_BUSY_POLL:
1327 		if (valbool && !capable(CAP_NET_ADMIN))
1328 			ret = -EPERM;
1329 		else
1330 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1331 		break;
1332 	case SO_BUSY_POLL_BUDGET:
1333 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1334 			ret = -EPERM;
1335 		} else {
1336 			if (val < 0 || val > U16_MAX)
1337 				ret = -EINVAL;
1338 			else
1339 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1340 		}
1341 		break;
1342 #endif
1343 
1344 	case SO_MAX_PACING_RATE:
1345 		{
1346 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1347 
1348 		if (sizeof(ulval) != sizeof(val) &&
1349 		    optlen >= sizeof(ulval) &&
1350 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1351 			ret = -EFAULT;
1352 			break;
1353 		}
1354 		if (ulval != ~0UL)
1355 			cmpxchg(&sk->sk_pacing_status,
1356 				SK_PACING_NONE,
1357 				SK_PACING_NEEDED);
1358 		sk->sk_max_pacing_rate = ulval;
1359 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1360 		break;
1361 		}
1362 	case SO_INCOMING_CPU:
1363 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1364 		break;
1365 
1366 	case SO_CNX_ADVICE:
1367 		if (val == 1)
1368 			dst_negative_advice(sk);
1369 		break;
1370 
1371 	case SO_ZEROCOPY:
1372 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1373 			if (!(sk_is_tcp(sk) ||
1374 			      (sk->sk_type == SOCK_DGRAM &&
1375 			       sk->sk_protocol == IPPROTO_UDP)))
1376 				ret = -ENOTSUPP;
1377 		} else if (sk->sk_family != PF_RDS) {
1378 			ret = -ENOTSUPP;
1379 		}
1380 		if (!ret) {
1381 			if (val < 0 || val > 1)
1382 				ret = -EINVAL;
1383 			else
1384 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1385 		}
1386 		break;
1387 
1388 	case SO_TXTIME:
1389 		if (optlen != sizeof(struct sock_txtime)) {
1390 			ret = -EINVAL;
1391 			break;
1392 		} else if (copy_from_sockptr(&sk_txtime, optval,
1393 			   sizeof(struct sock_txtime))) {
1394 			ret = -EFAULT;
1395 			break;
1396 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1397 			ret = -EINVAL;
1398 			break;
1399 		}
1400 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1401 		 * scheduler has enough safeguards.
1402 		 */
1403 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1404 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1405 			ret = -EPERM;
1406 			break;
1407 		}
1408 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1409 		sk->sk_clockid = sk_txtime.clockid;
1410 		sk->sk_txtime_deadline_mode =
1411 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1412 		sk->sk_txtime_report_errors =
1413 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1414 		break;
1415 
1416 	case SO_BINDTOIFINDEX:
1417 		ret = sock_bindtoindex_locked(sk, val);
1418 		break;
1419 
1420 	case SO_BUF_LOCK:
1421 		if (val & ~SOCK_BUF_LOCK_MASK) {
1422 			ret = -EINVAL;
1423 			break;
1424 		}
1425 		sk->sk_userlocks = val | (sk->sk_userlocks &
1426 					  ~SOCK_BUF_LOCK_MASK);
1427 		break;
1428 
1429 	case SO_RESERVE_MEM:
1430 	{
1431 		int delta;
1432 
1433 		if (val < 0) {
1434 			ret = -EINVAL;
1435 			break;
1436 		}
1437 
1438 		delta = val - sk->sk_reserved_mem;
1439 		if (delta < 0)
1440 			sock_release_reserved_memory(sk, -delta);
1441 		else
1442 			ret = sock_reserve_memory(sk, delta);
1443 		break;
1444 	}
1445 
1446 	default:
1447 		ret = -ENOPROTOOPT;
1448 		break;
1449 	}
1450 	release_sock(sk);
1451 	return ret;
1452 }
1453 EXPORT_SYMBOL(sock_setsockopt);
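
/*
 * Editorial example (user-space view; illustrative): most options above are
 * reached via setsockopt(2) with SOL_SOCKET, e.g. SO_LINGER:
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 5 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 *
 * which lands in the SO_LINGER case above, sets SOCK_LINGER and stores
 * sk_lingertime = 5 * HZ.  In-kernel users call the small helpers such as
 * sock_set_reuseaddr() or sock_no_linger() instead, which take the socket
 * lock themselves.
 */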
1454 
1455 static const struct cred *sk_get_peer_cred(struct sock *sk)
1456 {
1457 	const struct cred *cred;
1458 
1459 	spin_lock(&sk->sk_peer_lock);
1460 	cred = get_cred(sk->sk_peer_cred);
1461 	spin_unlock(&sk->sk_peer_lock);
1462 
1463 	return cred;
1464 }
1465 
1466 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1467 			  struct ucred *ucred)
1468 {
1469 	ucred->pid = pid_vnr(pid);
1470 	ucred->uid = ucred->gid = -1;
1471 	if (cred) {
1472 		struct user_namespace *current_ns = current_user_ns();
1473 
1474 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1475 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1476 	}
1477 }
1478 
1479 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1480 {
1481 	struct user_namespace *user_ns = current_user_ns();
1482 	int i;
1483 
1484 	for (i = 0; i < src->ngroups; i++)
1485 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1486 			return -EFAULT;
1487 
1488 	return 0;
1489 }
1490 
1491 int sock_getsockopt(struct socket *sock, int level, int optname,
1492 		    char __user *optval, int __user *optlen)
1493 {
1494 	struct sock *sk = sock->sk;
1495 
1496 	union {
1497 		int val;
1498 		u64 val64;
1499 		unsigned long ulval;
1500 		struct linger ling;
1501 		struct old_timeval32 tm32;
1502 		struct __kernel_old_timeval tm;
1503 		struct  __kernel_sock_timeval stm;
1504 		struct sock_txtime txtime;
1505 		struct so_timestamping timestamping;
1506 	} v;
1507 
1508 	int lv = sizeof(int);
1509 	int len;
1510 
1511 	if (get_user(len, optlen))
1512 		return -EFAULT;
1513 	if (len < 0)
1514 		return -EINVAL;
1515 
1516 	memset(&v, 0, sizeof(v));
1517 
1518 	switch (optname) {
1519 	case SO_DEBUG:
1520 		v.val = sock_flag(sk, SOCK_DBG);
1521 		break;
1522 
1523 	case SO_DONTROUTE:
1524 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1525 		break;
1526 
1527 	case SO_BROADCAST:
1528 		v.val = sock_flag(sk, SOCK_BROADCAST);
1529 		break;
1530 
1531 	case SO_SNDBUF:
1532 		v.val = sk->sk_sndbuf;
1533 		break;
1534 
1535 	case SO_RCVBUF:
1536 		v.val = sk->sk_rcvbuf;
1537 		break;
1538 
1539 	case SO_REUSEADDR:
1540 		v.val = sk->sk_reuse;
1541 		break;
1542 
1543 	case SO_REUSEPORT:
1544 		v.val = sk->sk_reuseport;
1545 		break;
1546 
1547 	case SO_KEEPALIVE:
1548 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1549 		break;
1550 
1551 	case SO_TYPE:
1552 		v.val = sk->sk_type;
1553 		break;
1554 
1555 	case SO_PROTOCOL:
1556 		v.val = sk->sk_protocol;
1557 		break;
1558 
1559 	case SO_DOMAIN:
1560 		v.val = sk->sk_family;
1561 		break;
1562 
1563 	case SO_ERROR:
1564 		v.val = -sock_error(sk);
1565 		if (v.val == 0)
1566 			v.val = xchg(&sk->sk_err_soft, 0);
1567 		break;
1568 
1569 	case SO_OOBINLINE:
1570 		v.val = sock_flag(sk, SOCK_URGINLINE);
1571 		break;
1572 
1573 	case SO_NO_CHECK:
1574 		v.val = sk->sk_no_check_tx;
1575 		break;
1576 
1577 	case SO_PRIORITY:
1578 		v.val = sk->sk_priority;
1579 		break;
1580 
1581 	case SO_LINGER:
1582 		lv		= sizeof(v.ling);
1583 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1584 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1585 		break;
1586 
1587 	case SO_BSDCOMPAT:
1588 		break;
1589 
1590 	case SO_TIMESTAMP_OLD:
1591 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1592 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1593 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1594 		break;
1595 
1596 	case SO_TIMESTAMPNS_OLD:
1597 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1598 		break;
1599 
1600 	case SO_TIMESTAMP_NEW:
1601 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1602 		break;
1603 
1604 	case SO_TIMESTAMPNS_NEW:
1605 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1606 		break;
1607 
1608 	case SO_TIMESTAMPING_OLD:
1609 		lv = sizeof(v.timestamping);
1610 		v.timestamping.flags = sk->sk_tsflags;
1611 		v.timestamping.bind_phc = sk->sk_bind_phc;
1612 		break;
1613 
1614 	case SO_RCVTIMEO_OLD:
1615 	case SO_RCVTIMEO_NEW:
1616 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1617 		break;
1618 
1619 	case SO_SNDTIMEO_OLD:
1620 	case SO_SNDTIMEO_NEW:
1621 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1622 		break;
1623 
1624 	case SO_RCVLOWAT:
1625 		v.val = sk->sk_rcvlowat;
1626 		break;
1627 
1628 	case SO_SNDLOWAT:
1629 		v.val = 1;
1630 		break;
1631 
1632 	case SO_PASSCRED:
1633 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1634 		break;
1635 
1636 	case SO_PEERCRED:
1637 	{
1638 		struct ucred peercred;
1639 		if (len > sizeof(peercred))
1640 			len = sizeof(peercred);
1641 
1642 		spin_lock(&sk->sk_peer_lock);
1643 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1644 		spin_unlock(&sk->sk_peer_lock);
1645 
1646 		if (copy_to_user(optval, &peercred, len))
1647 			return -EFAULT;
1648 		goto lenout;
1649 	}
1650 
1651 	case SO_PEERGROUPS:
1652 	{
1653 		const struct cred *cred;
1654 		int ret, n;
1655 
1656 		cred = sk_get_peer_cred(sk);
1657 		if (!cred)
1658 			return -ENODATA;
1659 
1660 		n = cred->group_info->ngroups;
1661 		if (len < n * sizeof(gid_t)) {
1662 			len = n * sizeof(gid_t);
1663 			put_cred(cred);
1664 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1665 		}
1666 		len = n * sizeof(gid_t);
1667 
1668 		ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1669 		put_cred(cred);
1670 		if (ret)
1671 			return ret;
1672 		goto lenout;
1673 	}
1674 
1675 	case SO_PEERNAME:
1676 	{
1677 		char address[128];
1678 
1679 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1680 		if (lv < 0)
1681 			return -ENOTCONN;
1682 		if (lv < len)
1683 			return -EINVAL;
1684 		if (copy_to_user(optval, address, len))
1685 			return -EFAULT;
1686 		goto lenout;
1687 	}
1688 
1689 	/* Dubious BSD thing... Probably nobody even uses it, but
1690 	 * the UNIX standard wants it for whatever reason... -DaveM
1691 	 */
1692 	case SO_ACCEPTCONN:
1693 		v.val = sk->sk_state == TCP_LISTEN;
1694 		break;
1695 
1696 	case SO_PASSSEC:
1697 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1698 		break;
1699 
1700 	case SO_PEERSEC:
1701 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1702 
1703 	case SO_MARK:
1704 		v.val = sk->sk_mark;
1705 		break;
1706 
1707 	case SO_RXQ_OVFL:
1708 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1709 		break;
1710 
1711 	case SO_WIFI_STATUS:
1712 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1713 		break;
1714 
1715 	case SO_PEEK_OFF:
1716 		if (!sock->ops->set_peek_off)
1717 			return -EOPNOTSUPP;
1718 
1719 		v.val = sk->sk_peek_off;
1720 		break;
1721 	case SO_NOFCS:
1722 		v.val = sock_flag(sk, SOCK_NOFCS);
1723 		break;
1724 
1725 	case SO_BINDTODEVICE:
1726 		return sock_getbindtodevice(sk, optval, optlen, len);
1727 
1728 	case SO_GET_FILTER:
1729 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1730 		if (len < 0)
1731 			return len;
1732 
1733 		goto lenout;
1734 
1735 	case SO_LOCK_FILTER:
1736 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1737 		break;
1738 
1739 	case SO_BPF_EXTENSIONS:
1740 		v.val = bpf_tell_extensions();
1741 		break;
1742 
1743 	case SO_SELECT_ERR_QUEUE:
1744 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1745 		break;
1746 
1747 #ifdef CONFIG_NET_RX_BUSY_POLL
1748 	case SO_BUSY_POLL:
1749 		v.val = sk->sk_ll_usec;
1750 		break;
1751 	case SO_PREFER_BUSY_POLL:
1752 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1753 		break;
1754 #endif
1755 
1756 	case SO_MAX_PACING_RATE:
1757 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1758 			lv = sizeof(v.ulval);
1759 			v.ulval = sk->sk_max_pacing_rate;
1760 		} else {
1761 			/* 32bit version */
1762 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1763 		}
1764 		break;
1765 
1766 	case SO_INCOMING_CPU:
1767 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1768 		break;
1769 
1770 	case SO_MEMINFO:
1771 	{
1772 		u32 meminfo[SK_MEMINFO_VARS];
1773 
1774 		sk_get_meminfo(sk, meminfo);
1775 
1776 		len = min_t(unsigned int, len, sizeof(meminfo));
1777 		if (copy_to_user(optval, &meminfo, len))
1778 			return -EFAULT;
1779 
1780 		goto lenout;
1781 	}
1782 
1783 #ifdef CONFIG_NET_RX_BUSY_POLL
1784 	case SO_INCOMING_NAPI_ID:
1785 		v.val = READ_ONCE(sk->sk_napi_id);
1786 
1787 		/* aggregate non-NAPI IDs down to 0 */
1788 		if (v.val < MIN_NAPI_ID)
1789 			v.val = 0;
1790 
1791 		break;
1792 #endif
1793 
1794 	case SO_COOKIE:
1795 		lv = sizeof(u64);
1796 		if (len < lv)
1797 			return -EINVAL;
1798 		v.val64 = sock_gen_cookie(sk);
1799 		break;
1800 
1801 	case SO_ZEROCOPY:
1802 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1803 		break;
1804 
1805 	case SO_TXTIME:
1806 		lv = sizeof(v.txtime);
1807 		v.txtime.clockid = sk->sk_clockid;
1808 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1809 				  SOF_TXTIME_DEADLINE_MODE : 0;
1810 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1811 				  SOF_TXTIME_REPORT_ERRORS : 0;
1812 		break;
1813 
1814 	case SO_BINDTOIFINDEX:
1815 		v.val = sk->sk_bound_dev_if;
1816 		break;
1817 
1818 	case SO_NETNS_COOKIE:
1819 		lv = sizeof(u64);
1820 		if (len != lv)
1821 			return -EINVAL;
1822 		v.val64 = sock_net(sk)->net_cookie;
1823 		break;
1824 
1825 	case SO_BUF_LOCK:
1826 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1827 		break;
1828 
1829 	case SO_RESERVE_MEM:
1830 		v.val = sk->sk_reserved_mem;
1831 		break;
1832 
1833 	default:
1834 		/* We implement the SO_SNDLOWAT etc to not be settable
1835 		 * (1003.1g 7).
1836 		 */
1837 		return -ENOPROTOOPT;
1838 	}
1839 
1840 	if (len > lv)
1841 		len = lv;
1842 	if (copy_to_user(optval, &v, len))
1843 		return -EFAULT;
1844 lenout:
1845 	if (put_user(len, optlen))
1846 		return -EFAULT;
1847 	return 0;
1848 }
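
/*
 * Editorial example (user-space view; illustrative): reading SO_PEERCRED from
 * a connected AF_UNIX socket goes through the SO_PEERCRED case above:
 *
 *	struct ucred cred;
 *	socklen_t len = sizeof(cred);
 *
 *	getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len);
 *
 * cred.pid/uid/gid now describe the peer, translated into the caller's
 * namespaces by cred_to_ucred().
 */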
1849 
1850 /*
1851  * Initialize an sk_lock.
1852  *
1853  * (We also register the sk_lock with the lock validator.)
1854  */
1855 static inline void sock_lock_init(struct sock *sk)
1856 {
1857 	if (sk->sk_kern_sock)
1858 		sock_lock_init_class_and_name(
1859 			sk,
1860 			af_family_kern_slock_key_strings[sk->sk_family],
1861 			af_family_kern_slock_keys + sk->sk_family,
1862 			af_family_kern_key_strings[sk->sk_family],
1863 			af_family_kern_keys + sk->sk_family);
1864 	else
1865 		sock_lock_init_class_and_name(
1866 			sk,
1867 			af_family_slock_key_strings[sk->sk_family],
1868 			af_family_slock_keys + sk->sk_family,
1869 			af_family_key_strings[sk->sk_family],
1870 			af_family_keys + sk->sk_family);
1871 }
1872 
1873 /*
1874  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1875  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1876  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1877  */
1878 static void sock_copy(struct sock *nsk, const struct sock *osk)
1879 {
1880 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1881 #ifdef CONFIG_SECURITY_NETWORK
1882 	void *sptr = nsk->sk_security;
1883 #endif
1884 
1885 	/* If we move sk_tx_queue_mapping out of the private section,
1886 	 * we must check if sk_tx_queue_clear() is called after
1887 	 * sock_copy() in sk_clone_lock().
1888 	 */
1889 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1890 		     offsetof(struct sock, sk_dontcopy_begin) ||
1891 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1892 		     offsetof(struct sock, sk_dontcopy_end));
1893 
1894 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1895 
1896 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1897 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1898 
1899 #ifdef CONFIG_SECURITY_NETWORK
1900 	nsk->sk_security = sptr;
1901 	security_sk_clone(osk, nsk);
1902 #endif
1903 }
1904 
1905 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1906 		int family)
1907 {
1908 	struct sock *sk;
1909 	struct kmem_cache *slab;
1910 
1911 	slab = prot->slab;
1912 	if (slab != NULL) {
1913 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1914 		if (!sk)
1915 			return sk;
1916 		if (want_init_on_alloc(priority))
1917 			sk_prot_clear_nulls(sk, prot->obj_size);
1918 	} else
1919 		sk = kmalloc(prot->obj_size, priority);
1920 
1921 	if (sk != NULL) {
1922 		if (security_sk_alloc(sk, family, priority))
1923 			goto out_free;
1924 
1925 		if (!try_module_get(prot->owner))
1926 			goto out_free_sec;
1927 	}
1928 
1929 	return sk;
1930 
1931 out_free_sec:
1932 	security_sk_free(sk);
1933 out_free:
1934 	if (slab != NULL)
1935 		kmem_cache_free(slab, sk);
1936 	else
1937 		kfree(sk);
1938 	return NULL;
1939 }
1940 
1941 static void sk_prot_free(struct proto *prot, struct sock *sk)
1942 {
1943 	struct kmem_cache *slab;
1944 	struct module *owner;
1945 
1946 	owner = prot->owner;
1947 	slab = prot->slab;
1948 
1949 	cgroup_sk_free(&sk->sk_cgrp_data);
1950 	mem_cgroup_sk_free(sk);
1951 	security_sk_free(sk);
1952 	if (slab != NULL)
1953 		kmem_cache_free(slab, sk);
1954 	else
1955 		kfree(sk);
1956 	module_put(owner);
1957 }
1958 
1959 /**
1960  *	sk_alloc - All socket objects are allocated here
1961  *	@net: the applicable net namespace
1962  *	@family: protocol family
1963  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1964  *	@prot: struct proto associated with this new sock instance
1965  *	@kern: is this to be a kernel socket?
1966  */
1967 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1968 		      struct proto *prot, int kern)
1969 {
1970 	struct sock *sk;
1971 
1972 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1973 	if (sk) {
1974 		sk->sk_family = family;
1975 		/*
1976 		 * See comment in struct sock definition to understand
1977 		 * why we need sk_prot_creator -acme
1978 		 */
1979 		sk->sk_prot = sk->sk_prot_creator = prot;
1980 		sk->sk_kern_sock = kern;
1981 		sock_lock_init(sk);
1982 		sk->sk_net_refcnt = kern ? 0 : 1;
1983 		if (likely(sk->sk_net_refcnt)) {
1984 			get_net(net);
1985 			sock_inuse_add(net, 1);
1986 		}
1987 
1988 		sock_net_set(sk, net);
1989 		refcount_set(&sk->sk_wmem_alloc, 1);
1990 
1991 		mem_cgroup_sk_alloc(sk);
1992 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1993 		sock_update_classid(&sk->sk_cgrp_data);
1994 		sock_update_netprioidx(&sk->sk_cgrp_data);
1995 		sk_tx_queue_clear(sk);
1996 	}
1997 
1998 	return sk;
1999 }
2000 EXPORT_SYMBOL(sk_alloc);
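
/*
 * Editorial example (sketch): address families allocate their socks here from
 * their ->create() handler, roughly as inet_create() does:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * with "answer_prot" being the struct proto looked up for the requested
 * type/protocol pair.
 */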
2001 
2002 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2003  * grace period. This is the case for UDP sockets and TCP listeners.
2004  */
2005 static void __sk_destruct(struct rcu_head *head)
2006 {
2007 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2008 	struct sk_filter *filter;
2009 
2010 	if (sk->sk_destruct)
2011 		sk->sk_destruct(sk);
2012 
2013 	filter = rcu_dereference_check(sk->sk_filter,
2014 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2015 	if (filter) {
2016 		sk_filter_uncharge(sk, filter);
2017 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2018 	}
2019 
2020 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2021 
2022 #ifdef CONFIG_BPF_SYSCALL
2023 	bpf_sk_storage_free(sk);
2024 #endif
2025 
2026 	if (atomic_read(&sk->sk_omem_alloc))
2027 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2028 			 __func__, atomic_read(&sk->sk_omem_alloc));
2029 
2030 	if (sk->sk_frag.page) {
2031 		put_page(sk->sk_frag.page);
2032 		sk->sk_frag.page = NULL;
2033 	}
2034 
2035 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2036 	put_cred(sk->sk_peer_cred);
2037 	put_pid(sk->sk_peer_pid);
2038 
2039 	if (likely(sk->sk_net_refcnt))
2040 		put_net(sock_net(sk));
2041 	sk_prot_free(sk->sk_prot_creator, sk);
2042 }
2043 
2044 void sk_destruct(struct sock *sk)
2045 {
2046 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2047 
2048 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2049 		reuseport_detach_sock(sk);
2050 		use_call_rcu = true;
2051 	}
2052 
2053 	if (use_call_rcu)
2054 		call_rcu(&sk->sk_rcu, __sk_destruct);
2055 	else
2056 		__sk_destruct(&sk->sk_rcu);
2057 }
2058 
2059 static void __sk_free(struct sock *sk)
2060 {
2061 	if (likely(sk->sk_net_refcnt))
2062 		sock_inuse_add(sock_net(sk), -1);
2063 
2064 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2065 		sock_diag_broadcast_destroy(sk);
2066 	else
2067 		sk_destruct(sk);
2068 }
2069 
2070 void sk_free(struct sock *sk)
2071 {
	/*
	 * We subtract one from sk_wmem_alloc so we can tell whether
	 * some packets are still in a tx queue.
	 * If the result is not zero, sock_wfree() will call __sk_free(sk) later.
	 */
2077 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2078 		__sk_free(sk);
2079 }
2080 EXPORT_SYMBOL(sk_free);
2081 
2082 static void sk_init_common(struct sock *sk)
2083 {
2084 	skb_queue_head_init(&sk->sk_receive_queue);
2085 	skb_queue_head_init(&sk->sk_write_queue);
2086 	skb_queue_head_init(&sk->sk_error_queue);
2087 
2088 	rwlock_init(&sk->sk_callback_lock);
2089 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2090 			af_rlock_keys + sk->sk_family,
2091 			af_family_rlock_key_strings[sk->sk_family]);
2092 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2093 			af_wlock_keys + sk->sk_family,
2094 			af_family_wlock_key_strings[sk->sk_family]);
2095 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2096 			af_elock_keys + sk->sk_family,
2097 			af_family_elock_key_strings[sk->sk_family]);
2098 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2099 			af_callback_keys + sk->sk_family,
2100 			af_family_clock_key_strings[sk->sk_family]);
2101 }
2102 
2103 /**
2104  *	sk_clone_lock - clone a socket, and lock its clone
2105  *	@sk: the socket to clone
2106  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2107  *
2108  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2109  */
2110 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2111 {
2112 	struct proto *prot = READ_ONCE(sk->sk_prot);
2113 	struct sk_filter *filter;
2114 	bool is_charged = true;
2115 	struct sock *newsk;
2116 
2117 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2118 	if (!newsk)
2119 		goto out;
2120 
2121 	sock_copy(newsk, sk);
2122 
2123 	newsk->sk_prot_creator = prot;
2124 
2125 	/* SANITY */
2126 	if (likely(newsk->sk_net_refcnt)) {
2127 		get_net(sock_net(newsk));
2128 		sock_inuse_add(sock_net(newsk), 1);
2129 	}
2130 	sk_node_init(&newsk->sk_node);
2131 	sock_lock_init(newsk);
2132 	bh_lock_sock(newsk);
2133 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2134 	newsk->sk_backlog.len = 0;
2135 
2136 	atomic_set(&newsk->sk_rmem_alloc, 0);
2137 
2138 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2139 	refcount_set(&newsk->sk_wmem_alloc, 1);
2140 
2141 	atomic_set(&newsk->sk_omem_alloc, 0);
2142 	sk_init_common(newsk);
2143 
2144 	newsk->sk_dst_cache	= NULL;
2145 	newsk->sk_dst_pending_confirm = 0;
2146 	newsk->sk_wmem_queued	= 0;
2147 	newsk->sk_forward_alloc = 0;
2148 	newsk->sk_reserved_mem  = 0;
2149 	atomic_set(&newsk->sk_drops, 0);
2150 	newsk->sk_send_head	= NULL;
2151 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2152 	atomic_set(&newsk->sk_zckey, 0);
2153 
2154 	sock_reset_flag(newsk, SOCK_DONE);
2155 
2156 	/* sk->sk_memcg will be populated at accept() time */
2157 	newsk->sk_memcg = NULL;
2158 
2159 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2160 
2161 	rcu_read_lock();
2162 	filter = rcu_dereference(sk->sk_filter);
2163 	if (filter != NULL)
2164 		/* though it's an empty new sock, the charging may fail
2165 		 * if sysctl_optmem_max was changed between creation of
2166 		 * original socket and cloning
2167 		 */
2168 		is_charged = sk_filter_charge(newsk, filter);
2169 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2170 	rcu_read_unlock();
2171 
2172 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2173 		/* We need to make sure that we don't uncharge the new
2174 		 * socket if we couldn't charge it in the first place
2175 		 * as otherwise we uncharge the parent's filter.
2176 		 */
2177 		if (!is_charged)
2178 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2179 		sk_free_unlock_clone(newsk);
2180 		newsk = NULL;
2181 		goto out;
2182 	}
2183 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2184 
2185 	if (bpf_sk_storage_clone(sk, newsk)) {
2186 		sk_free_unlock_clone(newsk);
2187 		newsk = NULL;
2188 		goto out;
2189 	}
2190 
2191 	/* Clear sk_user_data if parent had the pointer tagged
2192 	 * as not suitable for copying when cloning.
2193 	 */
2194 	if (sk_user_data_is_nocopy(newsk))
2195 		newsk->sk_user_data = NULL;
2196 
2197 	newsk->sk_err	   = 0;
2198 	newsk->sk_err_soft = 0;
2199 	newsk->sk_priority = 0;
2200 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2201 
2202 	/* Before updating sk_refcnt, we must commit prior changes to memory
2203 	 * (Documentation/RCU/rculist_nulls.rst for details)
2204 	 */
2205 	smp_wmb();
2206 	refcount_set(&newsk->sk_refcnt, 2);
2207 
2208 	/* Increment the counter in the same struct proto as the master
2209 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2210 	 * is the same as sk->sk_prot->socks, as this field was copied
2211 	 * with memcpy).
2212 	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always incremented the
	 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
	 * to be taken into account in all callers. -acme
2217 	 */
2218 	sk_refcnt_debug_inc(newsk);
2219 	sk_set_socket(newsk, NULL);
2220 	sk_tx_queue_clear(newsk);
2221 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2222 
2223 	if (newsk->sk_prot->sockets_allocated)
2224 		sk_sockets_allocated_inc(newsk);
2225 
2226 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2227 		net_enable_timestamp();
2228 out:
2229 	return newsk;
2230 }
2231 EXPORT_SYMBOL_GPL(sk_clone_lock);
2232 
2233 void sk_free_unlock_clone(struct sock *sk)
2234 {
	/* It is still a raw copy of the parent, so invalidate
	 * the destructor and do a plain sk_free().
	 */
2237 	sk->sk_destruct = NULL;
2238 	bh_unlock_sock(sk);
2239 	sk_free(sk);
2240 }
2241 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2242 
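/* Install @dst on the socket and cache the output device's capabilities:
 * derive sk_route_caps (GSO/SG/checksum offload) and the GSO size/segment
 * limits from the device, honouring sk_gso_disabled and xfrm offload
 * restrictions.
 */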
2243 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2244 {
2245 	u32 max_segs = 1;
2246 
2247 	sk_dst_set(sk, dst);
2248 	sk->sk_route_caps = dst->dev->features;
2249 	if (sk_is_tcp(sk))
2250 		sk->sk_route_caps |= NETIF_F_GSO;
2251 	if (sk->sk_route_caps & NETIF_F_GSO)
2252 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2253 	if (unlikely(sk->sk_gso_disabled))
2254 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2255 	if (sk_can_gso(sk)) {
2256 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2257 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2258 		} else {
2259 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2260 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2261 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2262 		}
2263 	}
2264 	sk->sk_gso_max_segs = max_segs;
2265 }
2266 EXPORT_SYMBOL_GPL(sk_setup_caps);
2267 
2268 /*
2269  *	Simple resource managers for sockets.
2270  */
2271 
2272 
2273 /*
2274  * Write buffer destructor automatically called from kfree_skb.
2275  */
2276 void sock_wfree(struct sk_buff *skb)
2277 {
2278 	struct sock *sk = skb->sk;
2279 	unsigned int len = skb->truesize;
2280 
2281 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2282 		/*
2283 		 * Keep a reference on sk_wmem_alloc, this will be released
2284 		 * after sk_write_space() call
2285 		 */
2286 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2287 		sk->sk_write_space(sk);
2288 		len = 1;
2289 	}
2290 	/*
2291 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2292 	 * could not do because of in-flight packets
2293 	 */
2294 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2295 		__sk_free(sk);
2296 }
2297 EXPORT_SYMBOL(sock_wfree);
2298 
2299 /* This variant of sock_wfree() is used by TCP,
2300  * since it sets SOCK_USE_WRITE_QUEUE.
2301  */
2302 void __sock_wfree(struct sk_buff *skb)
2303 {
2304 	struct sock *sk = skb->sk;
2305 
2306 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2307 		__sk_free(sk);
2308 }
2309 
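/* Make @sk the owner of @skb on the transmit path: charge skb->truesize to
 * sk_wmem_alloc and set sock_wfree() as the destructor. Non-full sockets
 * (e.g. request or timewait sockets) instead take a plain reference and use
 * sock_edemux().
 */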
2310 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2311 {
2312 	skb_orphan(skb);
2313 	skb->sk = sk;
2314 #ifdef CONFIG_INET
2315 	if (unlikely(!sk_fullsock(sk))) {
2316 		skb->destructor = sock_edemux;
2317 		sock_hold(sk);
2318 		return;
2319 	}
2320 #endif
2321 	skb->destructor = sock_wfree;
2322 	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but the following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed.
	 */
2328 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2329 }
2330 EXPORT_SYMBOL(skb_set_owner_w);
2331 
2332 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2333 {
2334 #ifdef CONFIG_TLS_DEVICE
2335 	/* Drivers depend on in-order delivery for crypto offload,
2336 	 * partial orphan breaks out-of-order-OK logic.
2337 	 */
2338 	if (skb->decrypted)
2339 		return false;
2340 #endif
2341 	return (skb->destructor == sock_wfree ||
2342 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2343 }
2344 
2345 /* This helper is used by netem, as it can hold packets in its
2346  * delay queue. We want to allow the owner socket to send more
2347  * packets, as if they were already TX completed by a typical driver.
2348  * But we also want to keep skb->sk set because some packet schedulers
2349  * rely on it (sch_fq for example).
2350  */
2351 void skb_orphan_partial(struct sk_buff *skb)
2352 {
2353 	if (skb_is_tcp_pure_ack(skb))
2354 		return;
2355 
2356 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2357 		return;
2358 
2359 	skb_orphan(skb);
2360 }
2361 EXPORT_SYMBOL(skb_orphan_partial);
2362 
2363 /*
2364  * Read buffer destructor automatically called from kfree_skb.
2365  */
2366 void sock_rfree(struct sk_buff *skb)
2367 {
2368 	struct sock *sk = skb->sk;
2369 	unsigned int len = skb->truesize;
2370 
2371 	atomic_sub(len, &sk->sk_rmem_alloc);
2372 	sk_mem_uncharge(sk, len);
2373 }
2374 EXPORT_SYMBOL(sock_rfree);
2375 
2376 /*
2377  * Buffer destructor for skbs that are not used directly in read or write
2378  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2379  */
2380 void sock_efree(struct sk_buff *skb)
2381 {
2382 	sock_put(skb->sk);
2383 }
2384 EXPORT_SYMBOL(sock_efree);
2385 
2386 /* Buffer destructor for prefetch/receive path where reference count may
2387  * not be held, e.g. for listen sockets.
2388  */
2389 #ifdef CONFIG_INET
2390 void sock_pfree(struct sk_buff *skb)
2391 {
2392 	if (sk_is_refcounted(skb->sk))
2393 		sock_gen_put(skb->sk);
2394 }
2395 EXPORT_SYMBOL(sock_pfree);
2396 #endif /* CONFIG_INET */
2397 
2398 kuid_t sock_i_uid(struct sock *sk)
2399 {
2400 	kuid_t uid;
2401 
2402 	read_lock_bh(&sk->sk_callback_lock);
2403 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2404 	read_unlock_bh(&sk->sk_callback_lock);
2405 	return uid;
2406 }
2407 EXPORT_SYMBOL(sock_i_uid);
2408 
2409 unsigned long sock_i_ino(struct sock *sk)
2410 {
2411 	unsigned long ino;
2412 
2413 	read_lock_bh(&sk->sk_callback_lock);
2414 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2415 	read_unlock_bh(&sk->sk_callback_lock);
2416 	return ino;
2417 }
2418 EXPORT_SYMBOL(sock_i_ino);
2419 
2420 /*
2421  * Allocate a skb from the socket's send buffer.
2422  */
2423 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2424 			     gfp_t priority)
2425 {
2426 	if (force ||
2427 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2428 		struct sk_buff *skb = alloc_skb(size, priority);
2429 
2430 		if (skb) {
2431 			skb_set_owner_w(skb, sk);
2432 			return skb;
2433 		}
2434 	}
2435 	return NULL;
2436 }
2437 EXPORT_SYMBOL(sock_wmalloc);
2438 
2439 static void sock_ofree(struct sk_buff *skb)
2440 {
2441 	struct sock *sk = skb->sk;
2442 
2443 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2444 }
2445 
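/* Allocate an skb charged against the socket's option memory
 * (sk_omem_alloc), bounded by sysctl_optmem_max; freed via sock_ofree().
 */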
2446 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2447 			     gfp_t priority)
2448 {
2449 	struct sk_buff *skb;
2450 
2451 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2452 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2453 	    sysctl_optmem_max)
2454 		return NULL;
2455 
2456 	skb = alloc_skb(size, priority);
2457 	if (!skb)
2458 		return NULL;
2459 
2460 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2461 	skb->sk = sk;
2462 	skb->destructor = sock_ofree;
2463 	return skb;
2464 }
2465 
2466 /*
2467  * Allocate a memory block from the socket's option memory buffer.
2468  */
2469 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2470 {
2471 	if ((unsigned int)size <= sysctl_optmem_max &&
2472 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2473 		void *mem;
2474 		/* First do the add, to avoid the race if kmalloc
2475 		 * might sleep.
2476 		 */
2477 		atomic_add(size, &sk->sk_omem_alloc);
2478 		mem = kmalloc(size, priority);
2479 		if (mem)
2480 			return mem;
2481 		atomic_sub(size, &sk->sk_omem_alloc);
2482 	}
2483 	return NULL;
2484 }
2485 EXPORT_SYMBOL(sock_kmalloc);
2486 
2487 /* Free an option memory block. Note, we actually want the inline
2488  * here as this allows gcc to detect the nullify and fold away the
2489  * condition entirely.
2490  */
2491 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2492 				  const bool nullify)
2493 {
2494 	if (WARN_ON_ONCE(!mem))
2495 		return;
2496 	if (nullify)
2497 		kfree_sensitive(mem);
2498 	else
2499 		kfree(mem);
2500 	atomic_sub(size, &sk->sk_omem_alloc);
2501 }
2502 
2503 void sock_kfree_s(struct sock *sk, void *mem, int size)
2504 {
2505 	__sock_kfree_s(sk, mem, size, false);
2506 }
2507 EXPORT_SYMBOL(sock_kfree_s);
2508 
2509 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2510 {
2511 	__sock_kfree_s(sk, mem, size, true);
2512 }
2513 EXPORT_SYMBOL(sock_kzfree_s);
2514 
/* It is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
 * I think these locks should be removed for datagram sockets.
 */
2518 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2519 {
2520 	DEFINE_WAIT(wait);
2521 
2522 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2523 	for (;;) {
2524 		if (!timeo)
2525 			break;
2526 		if (signal_pending(current))
2527 			break;
2528 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2529 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2530 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2531 			break;
2532 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2533 			break;
2534 		if (sk->sk_err)
2535 			break;
2536 		timeo = schedule_timeout(timeo);
2537 	}
2538 	finish_wait(sk_sleep(sk), &wait);
2539 	return timeo;
2540 }
2541 
2542 
2543 /*
2544  *	Generic send/receive buffer handlers
2545  */
2546 
2547 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2548 				     unsigned long data_len, int noblock,
2549 				     int *errcode, int max_page_order)
2550 {
2551 	struct sk_buff *skb;
2552 	long timeo;
2553 	int err;
2554 
2555 	timeo = sock_sndtimeo(sk, noblock);
2556 	for (;;) {
2557 		err = sock_error(sk);
2558 		if (err != 0)
2559 			goto failure;
2560 
2561 		err = -EPIPE;
2562 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2563 			goto failure;
2564 
2565 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2566 			break;
2567 
2568 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2569 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2570 		err = -EAGAIN;
2571 		if (!timeo)
2572 			goto failure;
2573 		if (signal_pending(current))
2574 			goto interrupted;
2575 		timeo = sock_wait_for_wmem(sk, timeo);
2576 	}
2577 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2578 				   errcode, sk->sk_allocation);
2579 	if (skb)
2580 		skb_set_owner_w(skb, sk);
2581 	return skb;
2582 
2583 interrupted:
2584 	err = sock_intr_errno(timeo);
2585 failure:
2586 	*errcode = err;
2587 	return NULL;
2588 }
2589 EXPORT_SYMBOL(sock_alloc_send_pskb);
2590 
2591 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2592 				    int noblock, int *errcode)
2593 {
2594 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2595 }
2596 EXPORT_SYMBOL(sock_alloc_send_skb);
2597 
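/* Parse one SOL_SOCKET control message (SO_MARK, SO_TIMESTAMPING_OLD,
 * SCM_TXTIME) into the sockcm_cookie used on the transmit path.
 * SCM_RIGHTS and SCM_CREDENTIALS are accepted but not processed here.
 */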
2598 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2599 		     struct sockcm_cookie *sockc)
2600 {
2601 	u32 tsflags;
2602 
2603 	switch (cmsg->cmsg_type) {
2604 	case SO_MARK:
2605 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2606 			return -EPERM;
2607 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2608 			return -EINVAL;
2609 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2610 		break;
2611 	case SO_TIMESTAMPING_OLD:
2612 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2613 			return -EINVAL;
2614 
2615 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2616 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2617 			return -EINVAL;
2618 
2619 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2620 		sockc->tsflags |= tsflags;
2621 		break;
2622 	case SCM_TXTIME:
2623 		if (!sock_flag(sk, SOCK_TXTIME))
2624 			return -EINVAL;
2625 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2626 			return -EINVAL;
2627 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2628 		break;
2629 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2630 	case SCM_RIGHTS:
2631 	case SCM_CREDENTIALS:
2632 		break;
2633 	default:
2634 		return -EINVAL;
2635 	}
2636 	return 0;
2637 }
2638 EXPORT_SYMBOL(__sock_cmsg_send);
2639 
2640 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2641 		   struct sockcm_cookie *sockc)
2642 {
2643 	struct cmsghdr *cmsg;
2644 	int ret;
2645 
2646 	for_each_cmsghdr(cmsg, msg) {
2647 		if (!CMSG_OK(msg, cmsg))
2648 			return -EINVAL;
2649 		if (cmsg->cmsg_level != SOL_SOCKET)
2650 			continue;
2651 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2652 		if (ret)
2653 			return ret;
2654 	}
2655 	return 0;
2656 }
2657 EXPORT_SYMBOL(sock_cmsg_send);
2658 
2659 static void sk_enter_memory_pressure(struct sock *sk)
2660 {
2661 	if (!sk->sk_prot->enter_memory_pressure)
2662 		return;
2663 
2664 	sk->sk_prot->enter_memory_pressure(sk);
2665 }
2666 
2667 static void sk_leave_memory_pressure(struct sock *sk)
2668 {
2669 	if (sk->sk_prot->leave_memory_pressure) {
2670 		sk->sk_prot->leave_memory_pressure(sk);
2671 	} else {
2672 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2673 
2674 		if (memory_pressure && READ_ONCE(*memory_pressure))
2675 			WRITE_ONCE(*memory_pressure, 0);
2676 	}
2677 }
2678 
2679 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2680 
2681 /**
2682  * skb_page_frag_refill - check that a page_frag contains enough room
2683  * @sz: minimum size of the fragment we want to get
2684  * @pfrag: pointer to page_frag
2685  * @gfp: priority for memory allocation
2686  *
2687  * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
2690  */
2691 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2692 {
2693 	if (pfrag->page) {
2694 		if (page_ref_count(pfrag->page) == 1) {
2695 			pfrag->offset = 0;
2696 			return true;
2697 		}
2698 		if (pfrag->offset + sz <= pfrag->size)
2699 			return true;
2700 		put_page(pfrag->page);
2701 	}
2702 
2703 	pfrag->offset = 0;
2704 	if (SKB_FRAG_PAGE_ORDER &&
2705 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2706 		/* Avoid direct reclaim but allow kswapd to wake */
2707 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2708 					  __GFP_COMP | __GFP_NOWARN |
2709 					  __GFP_NORETRY,
2710 					  SKB_FRAG_PAGE_ORDER);
2711 		if (likely(pfrag->page)) {
2712 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2713 			return true;
2714 		}
2715 	}
2716 	pfrag->page = alloc_page(gfp);
2717 	if (likely(pfrag->page)) {
2718 		pfrag->size = PAGE_SIZE;
2719 		return true;
2720 	}
2721 	return false;
2722 }
2723 EXPORT_SYMBOL(skb_page_frag_refill);
2724 
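/* Per-socket wrapper around skb_page_frag_refill(): on allocation failure,
 * enter memory pressure and shrink the send buffer so the caller backs off.
 */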
2725 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2726 {
2727 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2728 		return true;
2729 
2730 	sk_enter_memory_pressure(sk);
2731 	sk_stream_moderate_sndbuf(sk);
2732 	return false;
2733 }
2734 EXPORT_SYMBOL(sk_page_frag_refill);
2735 
2736 void __lock_sock(struct sock *sk)
2737 	__releases(&sk->sk_lock.slock)
2738 	__acquires(&sk->sk_lock.slock)
2739 {
2740 	DEFINE_WAIT(wait);
2741 
2742 	for (;;) {
2743 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2744 					TASK_UNINTERRUPTIBLE);
2745 		spin_unlock_bh(&sk->sk_lock.slock);
2746 		schedule();
2747 		spin_lock_bh(&sk->sk_lock.slock);
2748 		if (!sock_owned_by_user(sk))
2749 			break;
2750 	}
2751 	finish_wait(&sk->sk_lock.wq, &wait);
2752 }
2753 
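/* Process the backlog of packets queued by softirq context while the socket
 * was owned by a process. The spinlock is dropped around sk_backlog_rcv() so
 * new packets can keep being backlogged while we drain.
 */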
2754 void __release_sock(struct sock *sk)
2755 	__releases(&sk->sk_lock.slock)
2756 	__acquires(&sk->sk_lock.slock)
2757 {
2758 	struct sk_buff *skb, *next;
2759 
2760 	while ((skb = sk->sk_backlog.head) != NULL) {
2761 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2762 
2763 		spin_unlock_bh(&sk->sk_lock.slock);
2764 
2765 		do {
2766 			next = skb->next;
2767 			prefetch(next);
2768 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2769 			skb_mark_not_on_list(skb);
2770 			sk_backlog_rcv(sk, skb);
2771 
2772 			cond_resched();
2773 
2774 			skb = next;
2775 		} while (skb != NULL);
2776 
2777 		spin_lock_bh(&sk->sk_lock.slock);
2778 	}
2779 
2780 	/*
	 * Doing the zeroing here guarantees we cannot loop forever
2782 	 * while a wild producer attempts to flood us.
2783 	 */
2784 	sk->sk_backlog.len = 0;
2785 }
2786 
2787 void __sk_flush_backlog(struct sock *sk)
2788 {
2789 	spin_lock_bh(&sk->sk_lock.slock);
2790 	__release_sock(sk);
2791 	spin_unlock_bh(&sk->sk_lock.slock);
2792 }
2793 
2794 /**
2795  * sk_wait_data - wait for data to arrive at sk_receive_queue
2796  * @sk:    sock to wait on
2797  * @timeo: for how long
2798  * @skb:   last skb seen on sk_receive_queue
2799  *
 * Now socket state, including sk->sk_err, is changed only under the lock,
 * hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
2804  */
2805 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2806 {
2807 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2808 	int rc;
2809 
2810 	add_wait_queue(sk_sleep(sk), &wait);
2811 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2812 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2813 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2814 	remove_wait_queue(sk_sleep(sk), &wait);
2815 	return rc;
2816 }
2817 EXPORT_SYMBOL(sk_wait_data);
2818 
2819 /**
2820  *	__sk_mem_raise_allocated - increase memory_allocated
2821  *	@sk: socket
2822  *	@size: memory size to allocate
2823  *	@amt: pages to allocate
2824  *	@kind: allocation type
2825  *
2826  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2827  */
2828 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2829 {
2830 	struct proto *prot = sk->sk_prot;
2831 	long allocated = sk_memory_allocated_add(sk, amt);
2832 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2833 	bool charged = true;
2834 
2835 	if (memcg_charge &&
2836 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2837 						gfp_memcg_charge())))
2838 		goto suppress_allocation;
2839 
2840 	/* Under limit. */
2841 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2842 		sk_leave_memory_pressure(sk);
2843 		return 1;
2844 	}
2845 
2846 	/* Under pressure. */
2847 	if (allocated > sk_prot_mem_limits(sk, 1))
2848 		sk_enter_memory_pressure(sk);
2849 
2850 	/* Over hard limit. */
2851 	if (allocated > sk_prot_mem_limits(sk, 2))
2852 		goto suppress_allocation;
2853 
2854 	/* guarantee minimum buffer size under pressure */
2855 	if (kind == SK_MEM_RECV) {
2856 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2857 			return 1;
2858 
2859 	} else { /* SK_MEM_SEND */
2860 		int wmem0 = sk_get_wmem0(sk, prot);
2861 
2862 		if (sk->sk_type == SOCK_STREAM) {
2863 			if (sk->sk_wmem_queued < wmem0)
2864 				return 1;
2865 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
			return 1;
2867 		}
2868 	}
2869 
2870 	if (sk_has_memory_pressure(sk)) {
2871 		u64 alloc;
2872 
2873 		if (!sk_under_memory_pressure(sk))
2874 			return 1;
2875 		alloc = sk_sockets_allocated_read_positive(sk);
2876 		if (sk_prot_mem_limits(sk, 2) > alloc *
2877 		    sk_mem_pages(sk->sk_wmem_queued +
2878 				 atomic_read(&sk->sk_rmem_alloc) +
2879 				 sk->sk_forward_alloc))
2880 			return 1;
2881 	}
2882 
2883 suppress_allocation:
2884 
2885 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2886 		sk_stream_moderate_sndbuf(sk);
2887 
2888 		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
2890 		 */
2891 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2892 			/* Force charge with __GFP_NOFAIL */
2893 			if (memcg_charge && !charged) {
2894 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2895 					gfp_memcg_charge() | __GFP_NOFAIL);
2896 			}
2897 			return 1;
2898 		}
2899 	}
2900 
2901 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2902 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2903 
2904 	sk_memory_allocated_sub(sk, amt);
2905 
2906 	if (memcg_charge && charged)
2907 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2908 
2909 	return 0;
2910 }
2911 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2912 
2913 /**
2914  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2915  *	@sk: socket
2916  *	@size: memory size to allocate
2917  *	@kind: allocation type
2918  *
2919  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2920  *	rmem allocation. This function assumes that protocols which have
2921  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2922  */
2923 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2924 {
2925 	int ret, amt = sk_mem_pages(size);
2926 
2927 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2928 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2929 	if (!ret)
2930 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2931 	return ret;
2932 }
2933 EXPORT_SYMBOL(__sk_mem_schedule);
2934 
2935 /**
2936  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2937  *	@sk: socket
2938  *	@amount: number of quanta
2939  *
2940  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2941  */
2942 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2943 {
2944 	sk_memory_allocated_sub(sk, amount);
2945 
2946 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2947 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2948 
2949 	if (sk_under_memory_pressure(sk) &&
2950 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2951 		sk_leave_memory_pressure(sk);
2952 }
2953 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2954 
2955 /**
2956  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2957  *	@sk: socket
2958  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2959  */
2960 void __sk_mem_reclaim(struct sock *sk, int amount)
2961 {
2962 	amount >>= SK_MEM_QUANTUM_SHIFT;
2963 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2964 	__sk_mem_reduce_allocated(sk, amount);
2965 }
2966 EXPORT_SYMBOL(__sk_mem_reclaim);
2967 
2968 int sk_set_peek_off(struct sock *sk, int val)
2969 {
2970 	sk->sk_peek_off = val;
2971 	return 0;
2972 }
2973 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2974 
2975 /*
2976  * Set of default routines for initialising struct proto_ops when
2977  * the protocol does not support a particular function. In certain
2978  * cases where it makes no sense for a protocol to have a "do nothing"
2979  * function, some default processing is provided.
2980  */
2981 
2982 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2983 {
2984 	return -EOPNOTSUPP;
2985 }
2986 EXPORT_SYMBOL(sock_no_bind);
2987 
2988 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2989 		    int len, int flags)
2990 {
2991 	return -EOPNOTSUPP;
2992 }
2993 EXPORT_SYMBOL(sock_no_connect);
2994 
2995 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2996 {
2997 	return -EOPNOTSUPP;
2998 }
2999 EXPORT_SYMBOL(sock_no_socketpair);
3000 
3001 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3002 		   bool kern)
3003 {
3004 	return -EOPNOTSUPP;
3005 }
3006 EXPORT_SYMBOL(sock_no_accept);
3007 
3008 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3009 		    int peer)
3010 {
3011 	return -EOPNOTSUPP;
3012 }
3013 EXPORT_SYMBOL(sock_no_getname);
3014 
3015 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3016 {
3017 	return -EOPNOTSUPP;
3018 }
3019 EXPORT_SYMBOL(sock_no_ioctl);
3020 
3021 int sock_no_listen(struct socket *sock, int backlog)
3022 {
3023 	return -EOPNOTSUPP;
3024 }
3025 EXPORT_SYMBOL(sock_no_listen);
3026 
3027 int sock_no_shutdown(struct socket *sock, int how)
3028 {
3029 	return -EOPNOTSUPP;
3030 }
3031 EXPORT_SYMBOL(sock_no_shutdown);
3032 
3033 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3034 {
3035 	return -EOPNOTSUPP;
3036 }
3037 EXPORT_SYMBOL(sock_no_sendmsg);
3038 
3039 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3040 {
3041 	return -EOPNOTSUPP;
3042 }
3043 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3044 
3045 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3046 		    int flags)
3047 {
3048 	return -EOPNOTSUPP;
3049 }
3050 EXPORT_SYMBOL(sock_no_recvmsg);
3051 
3052 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3053 {
3054 	/* Mirror missing mmap method error code */
3055 	return -ENODEV;
3056 }
3057 EXPORT_SYMBOL(sock_no_mmap);
3058 
3059 /*
3060  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3061  * various sock-based usage counts.
3062  */
3063 void __receive_sock(struct file *file)
3064 {
3065 	struct socket *sock;
3066 
3067 	sock = sock_from_file(file);
3068 	if (sock) {
3069 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3070 		sock_update_classid(&sock->sk->sk_cgrp_data);
3071 	}
3072 }
3073 
3074 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3075 {
3076 	ssize_t res;
3077 	struct msghdr msg = {.msg_flags = flags};
3078 	struct kvec iov;
3079 	char *kaddr = kmap(page);
3080 	iov.iov_base = kaddr + offset;
3081 	iov.iov_len = size;
3082 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3083 	kunmap(page);
3084 	return res;
3085 }
3086 EXPORT_SYMBOL(sock_no_sendpage);
3087 
3088 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3089 				int offset, size_t size, int flags)
3090 {
3091 	ssize_t res;
3092 	struct msghdr msg = {.msg_flags = flags};
3093 	struct kvec iov;
3094 	char *kaddr = kmap(page);
3095 
3096 	iov.iov_base = kaddr + offset;
3097 	iov.iov_len = size;
3098 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3099 	kunmap(page);
3100 	return res;
3101 }
3102 EXPORT_SYMBOL(sock_no_sendpage_locked);
3103 
3104 /*
3105  *	Default Socket Callbacks
3106  */
3107 
3108 static void sock_def_wakeup(struct sock *sk)
3109 {
3110 	struct socket_wq *wq;
3111 
3112 	rcu_read_lock();
3113 	wq = rcu_dereference(sk->sk_wq);
3114 	if (skwq_has_sleeper(wq))
3115 		wake_up_interruptible_all(&wq->wait);
3116 	rcu_read_unlock();
3117 }
3118 
3119 static void sock_def_error_report(struct sock *sk)
3120 {
3121 	struct socket_wq *wq;
3122 
3123 	rcu_read_lock();
3124 	wq = rcu_dereference(sk->sk_wq);
3125 	if (skwq_has_sleeper(wq))
3126 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3127 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3128 	rcu_read_unlock();
3129 }
3130 
3131 void sock_def_readable(struct sock *sk)
3132 {
3133 	struct socket_wq *wq;
3134 
3135 	rcu_read_lock();
3136 	wq = rcu_dereference(sk->sk_wq);
3137 	if (skwq_has_sleeper(wq))
3138 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3139 						EPOLLRDNORM | EPOLLRDBAND);
3140 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3141 	rcu_read_unlock();
3142 }
3143 
3144 static void sock_def_write_space(struct sock *sk)
3145 {
3146 	struct socket_wq *wq;
3147 
3148 	rcu_read_lock();
3149 
3150 	/* Do not wake up a writer until he can make "significant"
3151 	 * progress.  --DaveM
3152 	 */
3153 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3154 		wq = rcu_dereference(sk->sk_wq);
3155 		if (skwq_has_sleeper(wq))
3156 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3157 						EPOLLWRNORM | EPOLLWRBAND);
3158 
3159 		/* Should agree with poll, otherwise some programs break */
3160 		if (sock_writeable(sk))
3161 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3162 	}
3163 
3164 	rcu_read_unlock();
3165 }
3166 
3167 static void sock_def_destruct(struct sock *sk)
3168 {
3169 }
3170 
3171 void sk_send_sigurg(struct sock *sk)
3172 {
3173 	if (sk->sk_socket && sk->sk_socket->file)
3174 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3175 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3176 }
3177 EXPORT_SYMBOL(sk_send_sigurg);
3178 
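/* Socket timers hold a reference on the socket while they are pending:
 * sk_reset_timer() takes one when arming an inactive timer, and
 * sk_stop_timer()/sk_stop_timer_sync() drop it when a pending timer is
 * cancelled.
 */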
3179 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3180 		    unsigned long expires)
3181 {
3182 	if (!mod_timer(timer, expires))
3183 		sock_hold(sk);
3184 }
3185 EXPORT_SYMBOL(sk_reset_timer);
3186 
3187 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3188 {
3189 	if (del_timer(timer))
3190 		__sock_put(sk);
3191 }
3192 EXPORT_SYMBOL(sk_stop_timer);
3193 
3194 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3195 {
3196 	if (del_timer_sync(timer))
3197 		__sock_put(sk);
3198 }
3199 EXPORT_SYMBOL(sk_stop_timer_sync);
3200 
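/* Initialise the generic fields of a struct sock to their defaults and wire
 * up the default callbacks (sock_def_*). Typically called by protocol
 * families after sk_alloc().
 */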
3201 void sock_init_data(struct socket *sock, struct sock *sk)
3202 {
3203 	sk_init_common(sk);
3204 	sk->sk_send_head	=	NULL;
3205 
3206 	timer_setup(&sk->sk_timer, NULL, 0);
3207 
3208 	sk->sk_allocation	=	GFP_KERNEL;
3209 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3210 	sk->sk_sndbuf		=	sysctl_wmem_default;
3211 	sk->sk_state		=	TCP_CLOSE;
3212 	sk_set_socket(sk, sock);
3213 
3214 	sock_set_flag(sk, SOCK_ZAPPED);
3215 
3216 	if (sock) {
3217 		sk->sk_type	=	sock->type;
3218 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3219 		sock->sk	=	sk;
3220 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3221 	} else {
3222 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3223 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3224 	}
3225 
3226 	rwlock_init(&sk->sk_callback_lock);
3227 	if (sk->sk_kern_sock)
3228 		lockdep_set_class_and_name(
3229 			&sk->sk_callback_lock,
3230 			af_kern_callback_keys + sk->sk_family,
3231 			af_family_kern_clock_key_strings[sk->sk_family]);
3232 	else
3233 		lockdep_set_class_and_name(
3234 			&sk->sk_callback_lock,
3235 			af_callback_keys + sk->sk_family,
3236 			af_family_clock_key_strings[sk->sk_family]);
3237 
3238 	sk->sk_state_change	=	sock_def_wakeup;
3239 	sk->sk_data_ready	=	sock_def_readable;
3240 	sk->sk_write_space	=	sock_def_write_space;
3241 	sk->sk_error_report	=	sock_def_error_report;
3242 	sk->sk_destruct		=	sock_def_destruct;
3243 
3244 	sk->sk_frag.page	=	NULL;
3245 	sk->sk_frag.offset	=	0;
3246 	sk->sk_peek_off		=	-1;
3247 
3248 	sk->sk_peer_pid 	=	NULL;
3249 	sk->sk_peer_cred	=	NULL;
3250 	spin_lock_init(&sk->sk_peer_lock);
3251 
3252 	sk->sk_write_pending	=	0;
3253 	sk->sk_rcvlowat		=	1;
3254 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3255 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3256 
3257 	sk->sk_stamp = SK_DEFAULT_STAMP;
3258 #if BITS_PER_LONG==32
3259 	seqlock_init(&sk->sk_stamp_seq);
3260 #endif
3261 	atomic_set(&sk->sk_zckey, 0);
3262 
3263 #ifdef CONFIG_NET_RX_BUSY_POLL
3264 	sk->sk_napi_id		=	0;
3265 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3266 #endif
3267 
3268 	sk->sk_max_pacing_rate = ~0UL;
3269 	sk->sk_pacing_rate = ~0UL;
3270 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3271 	sk->sk_incoming_cpu = -1;
3272 
3273 	sk_rx_queue_clear(sk);
3274 	/*
3275 	 * Before updating sk_refcnt, we must commit prior changes to memory
3276 	 * (Documentation/RCU/rculist_nulls.rst for details)
3277 	 */
3278 	smp_wmb();
3279 	refcount_set(&sk->sk_refcnt, 1);
3280 	atomic_set(&sk->sk_drops, 0);
3281 }
3282 EXPORT_SYMBOL(sock_init_data);
3283 
3284 void lock_sock_nested(struct sock *sk, int subclass)
3285 {
3286 	/* The sk_lock has mutex_lock() semantics here. */
3287 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3288 
3289 	might_sleep();
3290 	spin_lock_bh(&sk->sk_lock.slock);
3291 	if (sk->sk_lock.owned)
3292 		__lock_sock(sk);
3293 	sk->sk_lock.owned = 1;
3294 	spin_unlock_bh(&sk->sk_lock.slock);
3295 }
3296 EXPORT_SYMBOL(lock_sock_nested);
3297 
3298 void release_sock(struct sock *sk)
3299 {
3300 	spin_lock_bh(&sk->sk_lock.slock);
3301 	if (sk->sk_backlog.tail)
3302 		__release_sock(sk);
3303 
	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
3307 	if (sk->sk_prot->release_cb)
3308 		sk->sk_prot->release_cb(sk);
3309 
3310 	sock_release_ownership(sk);
3311 	if (waitqueue_active(&sk->sk_lock.wq))
3312 		wake_up(&sk->sk_lock.wq);
3313 	spin_unlock_bh(&sk->sk_lock.slock);
3314 }
3315 EXPORT_SYMBOL(release_sock);
3316 
3317 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3318 {
3319 	might_sleep();
3320 	spin_lock_bh(&sk->sk_lock.slock);
3321 
3322 	if (!sk->sk_lock.owned) {
3323 		/*
3324 		 * Fast path return with bottom halves disabled and
3325 		 * sock::sk_lock.slock held.
3326 		 *
3327 		 * The 'mutex' is not contended and holding
3328 		 * sock::sk_lock.slock prevents all other lockers to
3329 		 * proceed so the corresponding unlock_sock_fast() can
3330 		 * avoid the slow path of release_sock() completely and
3331 		 * just release slock.
3332 		 *
		 * From a semantic POV this is equivalent to 'acquiring'
3334 		 * the 'mutex', hence the corresponding lockdep
3335 		 * mutex_release() has to happen in the fast path of
3336 		 * unlock_sock_fast().
3337 		 */
3338 		return false;
3339 	}
3340 
3341 	__lock_sock(sk);
3342 	sk->sk_lock.owned = 1;
3343 	__acquire(&sk->sk_lock.slock);
3344 	spin_unlock_bh(&sk->sk_lock.slock);
3345 	return true;
3346 }
3347 EXPORT_SYMBOL(__lock_sock_fast);
3348 
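/* Report the timestamp of the last received packet to user space in the
 * format requested by the SIOCGSTAMP/SIOCGSTAMPNS ioctls (timeval or
 * timespec, optionally as a 32-bit time structure).
 */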
3349 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3350 		   bool timeval, bool time32)
3351 {
3352 	struct sock *sk = sock->sk;
3353 	struct timespec64 ts;
3354 
3355 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3356 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3357 	if (ts.tv_sec == -1)
3358 		return -ENOENT;
3359 	if (ts.tv_sec == 0) {
3360 		ktime_t kt = ktime_get_real();
3361 		sock_write_timestamp(sk, kt);
3362 		ts = ktime_to_timespec64(kt);
3363 	}
3364 
3365 	if (timeval)
3366 		ts.tv_nsec /= 1000;
3367 
3368 #ifdef CONFIG_COMPAT_32BIT_TIME
3369 	if (time32)
3370 		return put_old_timespec32(&ts, userstamp);
3371 #endif
3372 #ifdef CONFIG_SPARC64
3373 	/* beware of padding in sparc64 timeval */
3374 	if (timeval && !in_compat_syscall()) {
3375 		struct __kernel_old_timeval __user tv = {
3376 			.tv_sec = ts.tv_sec,
3377 			.tv_usec = ts.tv_nsec,
3378 		};
3379 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3380 			return -EFAULT;
3381 		return 0;
3382 	}
3383 #endif
3384 	return put_timespec64(&ts, userstamp);
3385 }
3386 EXPORT_SYMBOL(sock_gettstamp);
3387 
3388 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3389 {
3390 	if (!sock_flag(sk, flag)) {
3391 		unsigned long previous_flags = sk->sk_flags;
3392 
3393 		sock_set_flag(sk, flag);
3394 		/*
3395 		 * we just set one of the two flags which require net
3396 		 * time stamping, but time stamping might have been on
3397 		 * already because of the other one
3398 		 */
3399 		if (sock_needs_netstamp(sk) &&
3400 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3401 			net_enable_timestamp();
3402 	}
3403 }
3404 
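/* Dequeue one skb from the socket error queue, copy its payload to @msg and
 * append the associated sock_extended_err as a control message at
 * (@level, @type). Returns the number of bytes copied or a negative error.
 */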
3405 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3406 		       int level, int type)
3407 {
3408 	struct sock_exterr_skb *serr;
3409 	struct sk_buff *skb;
3410 	int copied, err;
3411 
3412 	err = -EAGAIN;
3413 	skb = sock_dequeue_err_skb(sk);
3414 	if (skb == NULL)
3415 		goto out;
3416 
3417 	copied = skb->len;
3418 	if (copied > len) {
3419 		msg->msg_flags |= MSG_TRUNC;
3420 		copied = len;
3421 	}
3422 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3423 	if (err)
3424 		goto out_free_skb;
3425 
3426 	sock_recv_timestamp(msg, sk, skb);
3427 
3428 	serr = SKB_EXT_ERR(skb);
3429 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3430 
3431 	msg->msg_flags |= MSG_ERRQUEUE;
3432 	err = copied;
3433 
3434 out_free_skb:
3435 	kfree_skb(skb);
3436 out:
3437 	return err;
3438 }
3439 EXPORT_SYMBOL(sock_recv_errqueue);
3440 
3441 /*
 *	Get a socket option on a socket.
3443  *
3444  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3445  *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
3447  */
3448 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3449 			   char __user *optval, int __user *optlen)
3450 {
3451 	struct sock *sk = sock->sk;
3452 
3453 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3454 }
3455 EXPORT_SYMBOL(sock_common_getsockopt);
3456 
3457 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3458 			int flags)
3459 {
3460 	struct sock *sk = sock->sk;
3461 	int addr_len = 0;
3462 	int err;
3463 
3464 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3465 				   flags & ~MSG_DONTWAIT, &addr_len);
3466 	if (err >= 0)
3467 		msg->msg_namelen = addr_len;
3468 	return err;
3469 }
3470 EXPORT_SYMBOL(sock_common_recvmsg);
3471 
3472 /*
3473  *	Set socket options on an inet socket.
3474  */
3475 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3476 			   sockptr_t optval, unsigned int optlen)
3477 {
3478 	struct sock *sk = sock->sk;
3479 
3480 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3481 }
3482 EXPORT_SYMBOL(sock_common_setsockopt);
3483 
3484 void sk_common_release(struct sock *sk)
3485 {
3486 	if (sk->sk_prot->destroy)
3487 		sk->sk_prot->destroy(sk);
3488 
3489 	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket, but the network stack still does.
3492 	 * Step one, detach it from networking:
3493 	 *
3494 	 * A. Remove from hash tables.
3495 	 */
3496 
3497 	sk->sk_prot->unhash(sk);
3498 
3499 	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs the
	 * receiver and did a hash table lookup before we unhashed the socket.
	 * They will reach the receive queue and be purged by the socket
	 * destructor.
	 *
	 * Also we still have packets pending on the receive queue and probably
	 * our own packets waiting in device queues. sock_destroy will drain the
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference is released.
	 */
3510 
3511 	sock_orphan(sk);
3512 
3513 	xfrm_sk_free_policy(sk);
3514 
3515 	sk_refcnt_debug_release(sk);
3516 
3517 	sock_put(sk);
3518 }
3519 EXPORT_SYMBOL(sk_common_release);
3520 
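/* Fill the SK_MEMINFO_* array with a snapshot of the socket's memory
 * accounting, as reported via the sock_diag interface.
 */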
3521 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3522 {
3523 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3524 
3525 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3526 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3527 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3528 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3529 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3530 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3531 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3532 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3533 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3534 }
3535 
3536 #ifdef CONFIG_PROC_FS
3537 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3538 
3539 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3540 {
3541 	int cpu, idx = prot->inuse_idx;
3542 	int res = 0;
3543 
3544 	for_each_possible_cpu(cpu)
3545 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3546 
3547 	return res >= 0 ? res : 0;
3548 }
3549 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3550 
3551 int sock_inuse_get(struct net *net)
3552 {
3553 	int cpu, res = 0;
3554 
3555 	for_each_possible_cpu(cpu)
3556 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3557 
3558 	return res;
}
EXPORT_SYMBOL_GPL(sock_inuse_get);
3562 
3563 static int __net_init sock_inuse_init_net(struct net *net)
3564 {
3565 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3566 	if (net->core.prot_inuse == NULL)
3567 		return -ENOMEM;
3568 	return 0;
3569 }
3570 
3571 static void __net_exit sock_inuse_exit_net(struct net *net)
3572 {
3573 	free_percpu(net->core.prot_inuse);
3574 }
3575 
3576 static struct pernet_operations net_inuse_ops = {
3577 	.init = sock_inuse_init_net,
3578 	.exit = sock_inuse_exit_net,
3579 };
3580 
3581 static __init int net_inuse_init(void)
3582 {
3583 	if (register_pernet_subsys(&net_inuse_ops))
3584 		panic("Cannot initialize net inuse counters");
3585 
3586 	return 0;
3587 }
3588 
3589 core_initcall(net_inuse_init);
3590 
3591 static int assign_proto_idx(struct proto *prot)
3592 {
3593 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3594 
3595 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3596 		pr_err("PROTO_INUSE_NR exhausted\n");
3597 		return -ENOSPC;
3598 	}
3599 
3600 	set_bit(prot->inuse_idx, proto_inuse_idx);
3601 	return 0;
3602 }
3603 
3604 static void release_proto_idx(struct proto *prot)
3605 {
3606 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3607 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3608 }
3609 #else
3610 static inline int assign_proto_idx(struct proto *prot)
3611 {
3612 	return 0;
3613 }
3614 
3615 static inline void release_proto_idx(struct proto *prot)
3616 {
3617 }
3618 
3619 #endif
3620 
3621 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3622 {
3623 	if (!twsk_prot)
3624 		return;
3625 	kfree(twsk_prot->twsk_slab_name);
3626 	twsk_prot->twsk_slab_name = NULL;
3627 	kmem_cache_destroy(twsk_prot->twsk_slab);
3628 	twsk_prot->twsk_slab = NULL;
3629 }
3630 
3631 static int tw_prot_init(const struct proto *prot)
3632 {
3633 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3634 
3635 	if (!twsk_prot)
3636 		return 0;
3637 
3638 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3639 					      prot->name);
3640 	if (!twsk_prot->twsk_slab_name)
3641 		return -ENOMEM;
3642 
3643 	twsk_prot->twsk_slab =
3644 		kmem_cache_create(twsk_prot->twsk_slab_name,
3645 				  twsk_prot->twsk_obj_size, 0,
3646 				  SLAB_ACCOUNT | prot->slab_flags,
3647 				  NULL);
3648 	if (!twsk_prot->twsk_slab) {
3649 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3650 			prot->name);
3651 		return -ENOMEM;
3652 	}
3653 
3654 	return 0;
3655 }
3656 
3657 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3658 {
3659 	if (!rsk_prot)
3660 		return;
3661 	kfree(rsk_prot->slab_name);
3662 	rsk_prot->slab_name = NULL;
3663 	kmem_cache_destroy(rsk_prot->slab);
3664 	rsk_prot->slab = NULL;
3665 }
3666 
3667 static int req_prot_init(const struct proto *prot)
3668 {
3669 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3670 
3671 	if (!rsk_prot)
3672 		return 0;
3673 
3674 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3675 					prot->name);
3676 	if (!rsk_prot->slab_name)
3677 		return -ENOMEM;
3678 
3679 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3680 					   rsk_prot->obj_size, 0,
3681 					   SLAB_ACCOUNT | prot->slab_flags,
3682 					   NULL);
3683 
3684 	if (!rsk_prot->slab) {
3685 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3686 			prot->name);
3687 		return -ENOMEM;
3688 	}
3689 	return 0;
3690 }
3691 
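/* Register a protocol with the networking core: optionally create its sock,
 * request_sock and timewait_sock slab caches, assign its inuse index for the
 * per-cpu statistics (when CONFIG_PROC_FS is enabled) and add it to
 * proto_list.
 */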
3692 int proto_register(struct proto *prot, int alloc_slab)
3693 {
3694 	int ret = -ENOBUFS;
3695 
3696 	if (alloc_slab) {
3697 		prot->slab = kmem_cache_create_usercopy(prot->name,
3698 					prot->obj_size, 0,
3699 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3700 					prot->slab_flags,
3701 					prot->useroffset, prot->usersize,
3702 					NULL);
3703 
3704 		if (prot->slab == NULL) {
3705 			pr_crit("%s: Can't create sock SLAB cache!\n",
3706 				prot->name);
3707 			goto out;
3708 		}
3709 
3710 		if (req_prot_init(prot))
3711 			goto out_free_request_sock_slab;
3712 
3713 		if (tw_prot_init(prot))
3714 			goto out_free_timewait_sock_slab;
3715 	}
3716 
3717 	mutex_lock(&proto_list_mutex);
3718 	ret = assign_proto_idx(prot);
3719 	if (ret) {
3720 		mutex_unlock(&proto_list_mutex);
3721 		goto out_free_timewait_sock_slab;
3722 	}
3723 	list_add(&prot->node, &proto_list);
3724 	mutex_unlock(&proto_list_mutex);
3725 	return ret;
3726 
3727 out_free_timewait_sock_slab:
3728 	if (alloc_slab)
3729 		tw_prot_cleanup(prot->twsk_prot);
3730 out_free_request_sock_slab:
3731 	if (alloc_slab) {
3732 		req_prot_cleanup(prot->rsk_prot);
3733 
3734 		kmem_cache_destroy(prot->slab);
3735 		prot->slab = NULL;
3736 	}
3737 out:
3738 	return ret;
3739 }
3740 EXPORT_SYMBOL(proto_register);
3741 
3742 void proto_unregister(struct proto *prot)
3743 {
3744 	mutex_lock(&proto_list_mutex);
3745 	release_proto_idx(prot);
3746 	list_del(&prot->node);
3747 	mutex_unlock(&proto_list_mutex);
3748 
3749 	kmem_cache_destroy(prot->slab);
3750 	prot->slab = NULL;
3751 
3752 	req_prot_cleanup(prot->rsk_prot);
3753 	tw_prot_cleanup(prot->twsk_prot);
3754 }
3755 EXPORT_SYMBOL(proto_unregister);
3756 
3757 int sock_load_diag_module(int family, int protocol)
3758 {
3759 	if (!protocol) {
3760 		if (!sock_is_registered(family))
3761 			return -ENOENT;
3762 
3763 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3764 				      NETLINK_SOCK_DIAG, family);
3765 	}
3766 
3767 #ifdef CONFIG_INET
3768 	if (family == AF_INET &&
3769 	    protocol != IPPROTO_RAW &&
3770 	    protocol < MAX_INET_PROTOS &&
3771 	    !rcu_access_pointer(inet_protos[protocol]))
3772 		return -ENOENT;
3773 #endif
3774 
3775 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3776 			      NETLINK_SOCK_DIAG, family, protocol);
3777 }
3778 EXPORT_SYMBOL(sock_load_diag_module);
3779 
3780 #ifdef CONFIG_PROC_FS
3781 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3782 	__acquires(proto_list_mutex)
3783 {
3784 	mutex_lock(&proto_list_mutex);
3785 	return seq_list_start_head(&proto_list, *pos);
3786 }
3787 
3788 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3789 {
3790 	return seq_list_next(v, &proto_list, pos);
3791 }
3792 
3793 static void proto_seq_stop(struct seq_file *seq, void *v)
3794 	__releases(proto_list_mutex)
3795 {
3796 	mutex_unlock(&proto_list_mutex);
3797 }
3798 
3799 static char proto_method_implemented(const void *method)
3800 {
3801 	return method == NULL ? 'n' : 'y';
3802 }
3803 static long sock_prot_memory_allocated(struct proto *proto)
3804 {
3805 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3806 }
3807 
3808 static const char *sock_prot_memory_pressure(struct proto *proto)
3809 {
3810 	return proto->memory_pressure != NULL ?
3811 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3812 }
3813 
3814 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3815 {
3816 
3817 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3818 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3819 		   proto->name,
3820 		   proto->obj_size,
3821 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3822 		   sock_prot_memory_allocated(proto),
3823 		   sock_prot_memory_pressure(proto),
3824 		   proto->max_header,
3825 		   proto->slab == NULL ? "no" : "yes",
3826 		   module_name(proto->owner),
3827 		   proto_method_implemented(proto->close),
3828 		   proto_method_implemented(proto->connect),
3829 		   proto_method_implemented(proto->disconnect),
3830 		   proto_method_implemented(proto->accept),
3831 		   proto_method_implemented(proto->ioctl),
3832 		   proto_method_implemented(proto->init),
3833 		   proto_method_implemented(proto->destroy),
3834 		   proto_method_implemented(proto->shutdown),
3835 		   proto_method_implemented(proto->setsockopt),
3836 		   proto_method_implemented(proto->getsockopt),
3837 		   proto_method_implemented(proto->sendmsg),
3838 		   proto_method_implemented(proto->recvmsg),
3839 		   proto_method_implemented(proto->sendpage),
3840 		   proto_method_implemented(proto->bind),
3841 		   proto_method_implemented(proto->backlog_rcv),
3842 		   proto_method_implemented(proto->hash),
3843 		   proto_method_implemented(proto->unhash),
3844 		   proto_method_implemented(proto->get_port),
3845 		   proto_method_implemented(proto->enter_memory_pressure));
3846 }
3847 
3848 static int proto_seq_show(struct seq_file *seq, void *v)
3849 {
3850 	if (v == &proto_list)
3851 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3852 			   "protocol",
3853 			   "size",
3854 			   "sockets",
3855 			   "memory",
3856 			   "press",
3857 			   "maxhdr",
3858 			   "slab",
3859 			   "module",
3860 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3861 	else
3862 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3863 	return 0;
3864 }
3865 
3866 static const struct seq_operations proto_seq_ops = {
3867 	.start  = proto_seq_start,
3868 	.next   = proto_seq_next,
3869 	.stop   = proto_seq_stop,
3870 	.show   = proto_seq_show,
3871 };
3872 
3873 static __net_init int proto_init_net(struct net *net)
3874 {
3875 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3876 			sizeof(struct seq_net_private)))
3877 		return -ENOMEM;
3878 
3879 	return 0;
3880 }
3881 
3882 static __net_exit void proto_exit_net(struct net *net)
3883 {
3884 	remove_proc_entry("protocols", net->proc_net);
3885 }
3886 
3888 static __net_initdata struct pernet_operations proto_net_ops = {
3889 	.init = proto_init_net,
3890 	.exit = proto_exit_net,
3891 };
3892 
3893 static int __init proto_init(void)
3894 {
3895 	return register_pernet_subsys(&proto_net_ops);
3896 }
3897 
3898 subsys_initcall(proto_init);
3899 
3900 #endif /* PROC_FS */
3901 
3902 #ifdef CONFIG_NET_RX_BUSY_POLL
3903 bool sk_busy_loop_end(void *p, unsigned long start_time)
3904 {
3905 	struct sock *sk = p;
3906 
3907 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3908 	       sk_busy_loop_timeout(sk, start_time);
3909 }
3910 EXPORT_SYMBOL(sk_busy_loop_end);
3911 #endif /* CONFIG_NET_RX_BUSY_POLL */
3912 
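/* Add an address to an already-bound socket, for protocols that implement
 * the ->bind_add() operation.
 */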
3913 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3914 {
3915 	if (!sk->sk_prot->bind_add)
3916 		return -EOPNOTSUPP;
3917 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3918 }
3919 EXPORT_SYMBOL(sock_bind_add);
3920