xref: /linux/net/core/sock.c (revision 8ce936c2f1a68c3a4f46578eed016ff92a67fbc6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
/*
 * File-local protocol list head and a mutex declared alongside it.
 * NOTE(review): neither is used in this part of the file; presumably the
 * mutex serializes access to the list - confirm against the users.
 */
144 static DEFINE_MUTEX(proto_list_mutex);
145 static LIST_HEAD(proto_list);
146 
/* Forward declaration; the definition is not in this part of the file. */
147 static void sock_inuse_add(struct net *net, int val);
148 
149 /**
150  * sk_ns_capable - General socket capability test
151  * @sk: Socket to use a capability on or through
152  * @user_ns: The user namespace of the capability to use
153  * @cap: The capability to use
154  *
155  * Test to see if the opener of the socket had when the socket was
156  * created and the current process has the capability @cap in the user
157  * namespace @user_ns.
158  */
159 bool sk_ns_capable(const struct sock *sk,
160 		   struct user_namespace *user_ns, int cap)
161 {
162 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
163 		ns_capable(user_ns, cap);
164 }
165 EXPORT_SYMBOL(sk_ns_capable);
166 
167 /**
168  * sk_capable - Socket global capability test
169  * @sk: Socket to use a capability on or through
170  * @cap: The global capability to use
171  *
172  * Test to see if the opener of the socket had when the socket was
173  * created and the current process has the capability @cap in all user
174  * namespaces.
175  */
176 bool sk_capable(const struct sock *sk, int cap)
177 {
178 	return sk_ns_capable(sk, &init_user_ns, cap);
179 }
180 EXPORT_SYMBOL(sk_capable);
181 
182 /**
183  * sk_net_capable - Network namespace socket capability test
184  * @sk: Socket to use a capability on or through
185  * @cap: The capability to use
186  *
187  * Test to see if the opener of the socket had when the socket was created
188  * and the current process has the capability @cap over the network namespace
189  * the socket is a member of.
190  */
191 bool sk_net_capable(const struct sock *sk, int cap)
192 {
193 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
194 }
195 EXPORT_SYMBOL(sk_net_capable);
196 
197 /*
198  * Each address family might have different locking rules, so we have
199  * one sk_lock key and one slock key per address family, with separate
200  * "kern" key arrays for internal (kernel) and userspace sockets.
201  */
202 static struct lock_class_key af_family_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_keys[AF_MAX];
204 static struct lock_class_key af_family_slock_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
206 
207 /*
208  * Make lock validator output more readable. (We pre-construct these
209  * strings at build time, so that runtime initialization of socket
210  * locks is fast.)
211  */
212 
/*
 * _sock_locks(x) expands to a comma-separated list of string literals, one
 * per table slot, each formed by pasting the literal prefix x in front of a
 * family name.  Slots without a family name carry their number instead
 * ("27", "28"), and the final entry is labelled "AF_MAX".
 */
213 #define _sock_locks(x)						  \
214   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
215   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
216   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
217   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
218   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
219   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
220   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
221   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
222   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
223   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
224   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
225   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
226   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
227   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
228   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
229   x "AF_MCTP"  , \
230   x "AF_MAX"
231 
/*
 * Name tables built from _sock_locks(); each has AF_MAX + 1 slots and
 * differs only in the prefix ("sk_lock-", "slock-", "clock-", their "k-"
 * variants, "rlock-", "wlock-", "elock-").
 */
232 static const char *const af_family_key_strings[AF_MAX+1] = {
233 	_sock_locks("sk_lock-")
234 };
235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
236 	_sock_locks("slock-")
237 };
238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
239 	_sock_locks("clock-")
240 };
241 
242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-sk_lock-")
244 };
245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-slock-")
247 };
248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("k-clock-")
250 };
251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("rlock-")
253 };
254 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
255 	_sock_locks("wlock-")
256 };
257 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
258 	_sock_locks("elock-")
259 };
260 
261 /*
262  * The locking rules for sk_callback_lock and for the sk queue locks are
263  * per address family, so split the lock classes by using a per-AF key:
264  */
265 static struct lock_class_key af_callback_keys[AF_MAX];
266 static struct lock_class_key af_rlock_keys[AF_MAX];
267 static struct lock_class_key af_wlock_keys[AF_MAX];
268 static struct lock_class_key af_elock_keys[AF_MAX];
269 static struct lock_class_key af_kern_callback_keys[AF_MAX];
270 
271 /* Run-time adjustable parameters. */
/* Caps applied to SO_SNDBUF / SO_RCVBUF requests in sock_setsockopt(). */
272 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
273 EXPORT_SYMBOL(sysctl_wmem_max);
274 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
275 EXPORT_SYMBOL(sysctl_rmem_max);
/* The defaults start out equal to the corresponding maxima. */
276 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
277 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
278 
279 /* Maximal space consumed by iovec or ancillary data, plus some slack */
280 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
281 EXPORT_SYMBOL(sysctl_optmem_max);
282 
/* NOTE(review): not referenced in this part of the file; enabled (1) by default. */
283 int sysctl_tstamp_allow_data __read_mostly = 1;
284 
/* Incremented by sk_set_memalloc() and decremented by sk_clear_memalloc(). */
285 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
286 EXPORT_SYMBOL_GPL(memalloc_socks_key);
287 
288 /**
289  * sk_set_memalloc - sets %SOCK_MEMALLOC
290  * @sk: socket to set it on
291  *
292  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
293  * It's the responsibility of the admin to adjust min_free_kbytes
294  * to meet the requirements
295  */
296 void sk_set_memalloc(struct sock *sk)
297 {
298 	sock_set_flag(sk, SOCK_MEMALLOC);
299 	sk->sk_allocation |= __GFP_MEMALLOC;
300 	static_branch_inc(&memalloc_socks_key);
301 }
302 EXPORT_SYMBOL_GPL(sk_set_memalloc);
303 
304 void sk_clear_memalloc(struct sock *sk)
305 {
306 	sock_reset_flag(sk, SOCK_MEMALLOC);
307 	sk->sk_allocation &= ~__GFP_MEMALLOC;
308 	static_branch_dec(&memalloc_socks_key);
309 
310 	/*
311 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
312 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
313 	 * it has rmem allocations due to the last swapfile being deactivated
314 	 * but there is a risk that the socket is unusable due to exceeding
315 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
316 	 */
317 	sk_mem_reclaim(sk);
318 }
319 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
320 
321 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
322 {
323 	int ret;
324 	unsigned int noreclaim_flag;
325 
326 	/* these should have been dropped before queueing */
327 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
328 
329 	noreclaim_flag = memalloc_noreclaim_save();
330 	ret = sk->sk_backlog_rcv(sk, skb);
331 	memalloc_noreclaim_restore(noreclaim_flag);
332 
333 	return ret;
334 }
335 EXPORT_SYMBOL(__sk_backlog_rcv);
336 
337 void sk_error_report(struct sock *sk)
338 {
339 	sk->sk_error_report(sk);
340 
341 	switch (sk->sk_family) {
342 	case AF_INET:
343 		fallthrough;
344 	case AF_INET6:
345 		trace_inet_sk_error_report(sk);
346 		break;
347 	default:
348 		break;
349 	}
350 }
351 EXPORT_SYMBOL(sk_error_report);
352 
353 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
354 {
355 	struct __kernel_sock_timeval tv;
356 
357 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
358 		tv.tv_sec = 0;
359 		tv.tv_usec = 0;
360 	} else {
361 		tv.tv_sec = timeo / HZ;
362 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
363 	}
364 
365 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
366 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
367 		*(struct old_timeval32 *)optval = tv32;
368 		return sizeof(tv32);
369 	}
370 
371 	if (old_timeval) {
372 		struct __kernel_old_timeval old_tv;
373 		old_tv.tv_sec = tv.tv_sec;
374 		old_tv.tv_usec = tv.tv_usec;
375 		*(struct __kernel_old_timeval *)optval = old_tv;
376 		return sizeof(old_tv);
377 	}
378 
379 	*(struct __kernel_sock_timeval *)optval = tv;
380 	return sizeof(tv);
381 }
382 
383 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
384 			    bool old_timeval)
385 {
386 	struct __kernel_sock_timeval tv;
387 
388 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
389 		struct old_timeval32 tv32;
390 
391 		if (optlen < sizeof(tv32))
392 			return -EINVAL;
393 
394 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
395 			return -EFAULT;
396 		tv.tv_sec = tv32.tv_sec;
397 		tv.tv_usec = tv32.tv_usec;
398 	} else if (old_timeval) {
399 		struct __kernel_old_timeval old_tv;
400 
401 		if (optlen < sizeof(old_tv))
402 			return -EINVAL;
403 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
404 			return -EFAULT;
405 		tv.tv_sec = old_tv.tv_sec;
406 		tv.tv_usec = old_tv.tv_usec;
407 	} else {
408 		if (optlen < sizeof(tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
411 			return -EFAULT;
412 	}
413 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
414 		return -EDOM;
415 
416 	if (tv.tv_sec < 0) {
417 		static int warned __read_mostly;
418 
419 		*timeo_p = 0;
420 		if (warned < 10 && net_ratelimit()) {
421 			warned++;
422 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
423 				__func__, current->comm, task_pid_nr(current));
424 		}
425 		return 0;
426 	}
427 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
428 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
429 		return 0;
430 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
431 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
432 	return 0;
433 }
434 
435 static bool sock_needs_netstamp(const struct sock *sk)
436 {
437 	switch (sk->sk_family) {
438 	case AF_UNSPEC:
439 	case AF_UNIX:
440 		return false;
441 	default:
442 		return true;
443 	}
444 }
445 
446 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
447 {
448 	if (sk->sk_flags & flags) {
449 		sk->sk_flags &= ~flags;
450 		if (sock_needs_netstamp(sk) &&
451 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
452 			net_disable_timestamp();
453 	}
454 }
455 
456 
457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
458 {
459 	unsigned long flags;
460 	struct sk_buff_head *list = &sk->sk_receive_queue;
461 
462 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
463 		atomic_inc(&sk->sk_drops);
464 		trace_sock_rcvqueue_full(sk, skb);
465 		return -ENOMEM;
466 	}
467 
468 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
469 		atomic_inc(&sk->sk_drops);
470 		return -ENOBUFS;
471 	}
472 
473 	skb->dev = NULL;
474 	skb_set_owner_r(skb, sk);
475 
476 	/* we escape from rcu protected region, make sure we dont leak
477 	 * a norefcounted dst
478 	 */
479 	skb_dst_force(skb);
480 
481 	spin_lock_irqsave(&list->lock, flags);
482 	sock_skb_set_dropcount(sk, skb);
483 	__skb_queue_tail(list, skb);
484 	spin_unlock_irqrestore(&list->lock, flags);
485 
486 	if (!sock_flag(sk, SOCK_DEAD))
487 		sk->sk_data_ready(sk);
488 	return 0;
489 }
490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
491 
/*
 * Run the socket filter on @skb and, if it accepts the packet, queue it
 * with __sock_queue_rcv_skb().  Returns the filter's non-zero result when
 * it rejects the packet, otherwise the result of the queueing step.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int filter_rc = sk_filter(sk, skb);

	if (filter_rc != 0)
		return filter_rc;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
503 
504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
505 		     const int nested, unsigned int trim_cap, bool refcounted)
506 {
507 	int rc = NET_RX_SUCCESS;
508 
509 	if (sk_filter_trim_cap(sk, skb, trim_cap))
510 		goto discard_and_relse;
511 
512 	skb->dev = NULL;
513 
514 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
515 		atomic_inc(&sk->sk_drops);
516 		goto discard_and_relse;
517 	}
518 	if (nested)
519 		bh_lock_sock_nested(sk);
520 	else
521 		bh_lock_sock(sk);
522 	if (!sock_owned_by_user(sk)) {
523 		/*
524 		 * trylock + unlock semantics:
525 		 */
526 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
527 
528 		rc = sk_backlog_rcv(sk, skb);
529 
530 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
531 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
532 		bh_unlock_sock(sk);
533 		atomic_inc(&sk->sk_drops);
534 		goto discard_and_relse;
535 	}
536 
537 	bh_unlock_sock(sk);
538 out:
539 	if (refcounted)
540 		sock_put(sk);
541 	return rc;
542 discard_and_relse:
543 	kfree_skb(skb);
544 	goto out;
545 }
546 EXPORT_SYMBOL(__sk_receive_skb);
547 
548 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
549 							  u32));
550 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
551 							   u32));
552 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
553 {
554 	struct dst_entry *dst = __sk_dst_get(sk);
555 
556 	if (dst && dst->obsolete &&
557 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
558 			       dst, cookie) == NULL) {
559 		sk_tx_queue_clear(sk);
560 		sk->sk_dst_pending_confirm = 0;
561 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
562 		dst_release(dst);
563 		return NULL;
564 	}
565 
566 	return dst;
567 }
568 EXPORT_SYMBOL(__sk_dst_check);
569 
570 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
571 {
572 	struct dst_entry *dst = sk_dst_get(sk);
573 
574 	if (dst && dst->obsolete &&
575 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
576 			       dst, cookie) == NULL) {
577 		sk_dst_reset(sk);
578 		dst_release(dst);
579 		return NULL;
580 	}
581 
582 	return dst;
583 }
584 EXPORT_SYMBOL(sk_dst_check);
585 
586 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
587 {
588 	int ret = -ENOPROTOOPT;
589 #ifdef CONFIG_NETDEVICES
590 	struct net *net = sock_net(sk);
591 
592 	/* Sorry... */
593 	ret = -EPERM;
594 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
595 		goto out;
596 
597 	ret = -EINVAL;
598 	if (ifindex < 0)
599 		goto out;
600 
601 	sk->sk_bound_dev_if = ifindex;
602 	if (sk->sk_prot->rehash)
603 		sk->sk_prot->rehash(sk);
604 	sk_dst_reset(sk);
605 
606 	ret = 0;
607 
608 out:
609 #endif
610 
611 	return ret;
612 }
613 
/*
 * Bind @sk to the device with index @ifindex.  When @lock_sk is true the
 * socket lock is taken around the operation; otherwise the binding is
 * changed without locking here.  Returns the result of
 * sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int err;

	if (!lock_sk)
		return sock_bindtoindex_locked(sk, ifindex);

	lock_sock(sk);
	err = sock_bindtoindex_locked(sk, ifindex);
	release_sock(sk);

	return err;
}
EXPORT_SYMBOL(sock_bindtoindex);
627 
628 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
629 {
630 	int ret = -ENOPROTOOPT;
631 #ifdef CONFIG_NETDEVICES
632 	struct net *net = sock_net(sk);
633 	char devname[IFNAMSIZ];
634 	int index;
635 
636 	ret = -EINVAL;
637 	if (optlen < 0)
638 		goto out;
639 
640 	/* Bind this socket to a particular device like "eth0",
641 	 * as specified in the passed interface name. If the
642 	 * name is "" or the option length is zero the socket
643 	 * is not bound.
644 	 */
645 	if (optlen > IFNAMSIZ - 1)
646 		optlen = IFNAMSIZ - 1;
647 	memset(devname, 0, sizeof(devname));
648 
649 	ret = -EFAULT;
650 	if (copy_from_sockptr(devname, optval, optlen))
651 		goto out;
652 
653 	index = 0;
654 	if (devname[0] != '\0') {
655 		struct net_device *dev;
656 
657 		rcu_read_lock();
658 		dev = dev_get_by_name_rcu(net, devname);
659 		if (dev)
660 			index = dev->ifindex;
661 		rcu_read_unlock();
662 		ret = -ENODEV;
663 		if (!dev)
664 			goto out;
665 	}
666 
667 	return sock_bindtoindex(sk, index, true);
668 out:
669 #endif
670 
671 	return ret;
672 }
673 
674 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
675 				int __user *optlen, int len)
676 {
677 	int ret = -ENOPROTOOPT;
678 #ifdef CONFIG_NETDEVICES
679 	struct net *net = sock_net(sk);
680 	char devname[IFNAMSIZ];
681 
682 	if (sk->sk_bound_dev_if == 0) {
683 		len = 0;
684 		goto zero;
685 	}
686 
687 	ret = -EINVAL;
688 	if (len < IFNAMSIZ)
689 		goto out;
690 
691 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
692 	if (ret)
693 		goto out;
694 
695 	len = strlen(devname) + 1;
696 
697 	ret = -EFAULT;
698 	if (copy_to_user(optval, devname, len))
699 		goto out;
700 
701 zero:
702 	ret = -EFAULT;
703 	if (put_user(len, optlen))
704 		goto out;
705 
706 	ret = 0;
707 
708 out:
709 #endif
710 
711 	return ret;
712 }
713 
714 bool sk_mc_loop(struct sock *sk)
715 {
716 	if (dev_recursion_level())
717 		return false;
718 	if (!sk)
719 		return true;
720 	switch (sk->sk_family) {
721 	case AF_INET:
722 		return inet_sk(sk)->mc_loop;
723 #if IS_ENABLED(CONFIG_IPV6)
724 	case AF_INET6:
725 		return inet6_sk(sk)->mc_loop;
726 #endif
727 	}
728 	WARN_ON_ONCE(1);
729 	return true;
730 }
731 EXPORT_SYMBOL(sk_mc_loop);
732 
733 void sock_set_reuseaddr(struct sock *sk)
734 {
735 	lock_sock(sk);
736 	sk->sk_reuse = SK_CAN_REUSE;
737 	release_sock(sk);
738 }
739 EXPORT_SYMBOL(sock_set_reuseaddr);
740 
741 void sock_set_reuseport(struct sock *sk)
742 {
743 	lock_sock(sk);
744 	sk->sk_reuseport = true;
745 	release_sock(sk);
746 }
747 EXPORT_SYMBOL(sock_set_reuseport);
748 
749 void sock_no_linger(struct sock *sk)
750 {
751 	lock_sock(sk);
752 	sk->sk_lingertime = 0;
753 	sock_set_flag(sk, SOCK_LINGER);
754 	release_sock(sk);
755 }
756 EXPORT_SYMBOL(sock_no_linger);
757 
758 void sock_set_priority(struct sock *sk, u32 priority)
759 {
760 	lock_sock(sk);
761 	sk->sk_priority = priority;
762 	release_sock(sk);
763 }
764 EXPORT_SYMBOL(sock_set_priority);
765 
766 void sock_set_sndtimeo(struct sock *sk, s64 secs)
767 {
768 	lock_sock(sk);
769 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
770 		sk->sk_sndtimeo = secs * HZ;
771 	else
772 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
773 	release_sock(sk);
774 }
775 EXPORT_SYMBOL(sock_set_sndtimeo);
776 
777 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
778 {
779 	if (val)  {
780 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
781 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
782 		sock_set_flag(sk, SOCK_RCVTSTAMP);
783 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
784 	} else {
785 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
786 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
787 	}
788 }
789 
/**
 * sock_enable_timestamps - enable receive timestamps on a socket
 * @sk: socket to update
 *
 * Under the socket lock, enables receive timestamping with
 * SOCK_RCVTSTAMPNS set and SOCK_TSTAMP_NEW cleared.
 */
void sock_enable_timestamps(struct sock *sk)
{
	const bool use_new = false;
	const bool use_ns = true;

	lock_sock(sk);
	__sock_set_timestamps(sk, true, use_new, use_ns);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
797 
798 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
799 {
800 	switch (optname) {
801 	case SO_TIMESTAMP_OLD:
802 		__sock_set_timestamps(sk, valbool, false, false);
803 		break;
804 	case SO_TIMESTAMP_NEW:
805 		__sock_set_timestamps(sk, valbool, true, false);
806 		break;
807 	case SO_TIMESTAMPNS_OLD:
808 		__sock_set_timestamps(sk, valbool, false, true);
809 		break;
810 	case SO_TIMESTAMPNS_NEW:
811 		__sock_set_timestamps(sk, valbool, true, true);
812 		break;
813 	}
814 }
815 
816 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
817 {
818 	struct net *net = sock_net(sk);
819 	struct net_device *dev = NULL;
820 	bool match = false;
821 	int *vclock_index;
822 	int i, num;
823 
824 	if (sk->sk_bound_dev_if)
825 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
826 
827 	if (!dev) {
828 		pr_err("%s: sock not bind to device\n", __func__);
829 		return -EOPNOTSUPP;
830 	}
831 
832 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
833 	for (i = 0; i < num; i++) {
834 		if (*(vclock_index + i) == phc_index) {
835 			match = true;
836 			break;
837 		}
838 	}
839 
840 	if (num > 0)
841 		kfree(vclock_index);
842 
843 	if (!match)
844 		return -EINVAL;
845 
846 	sk->sk_bind_phc = phc_index;
847 
848 	return 0;
849 }
850 
851 int sock_set_timestamping(struct sock *sk, int optname,
852 			  struct so_timestamping timestamping)
853 {
854 	int val = timestamping.flags;
855 	int ret;
856 
857 	if (val & ~SOF_TIMESTAMPING_MASK)
858 		return -EINVAL;
859 
860 	if (val & SOF_TIMESTAMPING_OPT_ID &&
861 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
862 		if (sk->sk_protocol == IPPROTO_TCP &&
863 		    sk->sk_type == SOCK_STREAM) {
864 			if ((1 << sk->sk_state) &
865 			    (TCPF_CLOSE | TCPF_LISTEN))
866 				return -EINVAL;
867 			sk->sk_tskey = tcp_sk(sk)->snd_una;
868 		} else {
869 			sk->sk_tskey = 0;
870 		}
871 	}
872 
873 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
874 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
875 		return -EINVAL;
876 
877 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
878 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
879 		if (ret)
880 			return ret;
881 	}
882 
883 	sk->sk_tsflags = val;
884 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
885 
886 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
887 		sock_enable_timestamp(sk,
888 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
889 	else
890 		sock_disable_timestamp(sk,
891 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
892 	return 0;
893 }
894 
895 void sock_set_keepalive(struct sock *sk)
896 {
897 	lock_sock(sk);
898 	if (sk->sk_prot->keepalive)
899 		sk->sk_prot->keepalive(sk, true);
900 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
901 	release_sock(sk);
902 }
903 EXPORT_SYMBOL(sock_set_keepalive);
904 
905 static void __sock_set_rcvbuf(struct sock *sk, int val)
906 {
907 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
908 	 * as a negative value.
909 	 */
910 	val = min_t(int, val, INT_MAX / 2);
911 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
912 
913 	/* We double it on the way in to account for "struct sk_buff" etc.
914 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
915 	 * will allow that much actual data to be received on that socket.
916 	 *
917 	 * Applications are unaware that "struct sk_buff" and other overheads
918 	 * allocate from the receive buffer during socket buffer allocation.
919 	 *
920 	 * And after considering the possible alternatives, returning the value
921 	 * we actually used in getsockopt is the most desirable behavior.
922 	 */
923 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
924 }
925 
/**
 * sock_set_rcvbuf - set the receive buffer size of a socket
 * @sk: socket to update
 * @val: requested size, handled as described in __sock_set_rcvbuf()
 *
 * Locked wrapper around __sock_set_rcvbuf().  @val is not capped against
 * sysctl_rmem_max here.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);

	__sock_set_rcvbuf(sk, val);

	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
933 
934 static void __sock_set_mark(struct sock *sk, u32 val)
935 {
936 	if (val != sk->sk_mark) {
937 		sk->sk_mark = val;
938 		sk_dst_reset(sk);
939 	}
940 }
941 
942 void sock_set_mark(struct sock *sk, u32 val)
943 {
944 	lock_sock(sk);
945 	__sock_set_mark(sk, val);
946 	release_sock(sk);
947 }
948 EXPORT_SYMBOL(sock_set_mark);
949 
950 /*
951  *	This is meant for all protocols to use and covers goings on
952  *	at the socket level. Everything here is generic.
953  */
954 
955 int sock_setsockopt(struct socket *sock, int level, int optname,
956 		    sockptr_t optval, unsigned int optlen)
957 {
958 	struct so_timestamping timestamping;
959 	struct sock_txtime sk_txtime;
960 	struct sock *sk = sock->sk;
961 	int val;
962 	int valbool;
963 	struct linger ling;
964 	int ret = 0;
965 
966 	/*
967 	 *	Options without arguments
968 	 */
969 
970 	if (optname == SO_BINDTODEVICE)
971 		return sock_setbindtodevice(sk, optval, optlen);
972 
973 	if (optlen < sizeof(int))
974 		return -EINVAL;
975 
976 	if (copy_from_sockptr(&val, optval, sizeof(val)))
977 		return -EFAULT;
978 
979 	valbool = val ? 1 : 0;
980 
981 	lock_sock(sk);
982 
983 	switch (optname) {
984 	case SO_DEBUG:
985 		if (val && !capable(CAP_NET_ADMIN))
986 			ret = -EACCES;
987 		else
988 			sock_valbool_flag(sk, SOCK_DBG, valbool);
989 		break;
990 	case SO_REUSEADDR:
991 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
992 		break;
993 	case SO_REUSEPORT:
994 		sk->sk_reuseport = valbool;
995 		break;
996 	case SO_TYPE:
997 	case SO_PROTOCOL:
998 	case SO_DOMAIN:
999 	case SO_ERROR:
1000 		ret = -ENOPROTOOPT;
1001 		break;
1002 	case SO_DONTROUTE:
1003 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1004 		sk_dst_reset(sk);
1005 		break;
1006 	case SO_BROADCAST:
1007 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1008 		break;
1009 	case SO_SNDBUF:
1010 		/* Don't error on this BSD doesn't and if you think
1011 		 * about it this is right. Otherwise apps have to
1012 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1013 		 * are treated in BSD as hints
1014 		 */
1015 		val = min_t(u32, val, sysctl_wmem_max);
1016 set_sndbuf:
1017 		/* Ensure val * 2 fits into an int, to prevent max_t()
1018 		 * from treating it as a negative value.
1019 		 */
1020 		val = min_t(int, val, INT_MAX / 2);
1021 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1022 		WRITE_ONCE(sk->sk_sndbuf,
1023 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1024 		/* Wake up sending tasks if we upped the value. */
1025 		sk->sk_write_space(sk);
1026 		break;
1027 
1028 	case SO_SNDBUFFORCE:
1029 		if (!capable(CAP_NET_ADMIN)) {
1030 			ret = -EPERM;
1031 			break;
1032 		}
1033 
1034 		/* No negative values (to prevent underflow, as val will be
1035 		 * multiplied by 2).
1036 		 */
1037 		if (val < 0)
1038 			val = 0;
1039 		goto set_sndbuf;
1040 
1041 	case SO_RCVBUF:
1042 		/* Don't error on this BSD doesn't and if you think
1043 		 * about it this is right. Otherwise apps have to
1044 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1045 		 * are treated in BSD as hints
1046 		 */
1047 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1048 		break;
1049 
1050 	case SO_RCVBUFFORCE:
1051 		if (!capable(CAP_NET_ADMIN)) {
1052 			ret = -EPERM;
1053 			break;
1054 		}
1055 
1056 		/* No negative values (to prevent underflow, as val will be
1057 		 * multiplied by 2).
1058 		 */
1059 		__sock_set_rcvbuf(sk, max(val, 0));
1060 		break;
1061 
1062 	case SO_KEEPALIVE:
1063 		if (sk->sk_prot->keepalive)
1064 			sk->sk_prot->keepalive(sk, valbool);
1065 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1066 		break;
1067 
1068 	case SO_OOBINLINE:
1069 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1070 		break;
1071 
1072 	case SO_NO_CHECK:
1073 		sk->sk_no_check_tx = valbool;
1074 		break;
1075 
1076 	case SO_PRIORITY:
1077 		if ((val >= 0 && val <= 6) ||
1078 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1079 			sk->sk_priority = val;
1080 		else
1081 			ret = -EPERM;
1082 		break;
1083 
1084 	case SO_LINGER:
1085 		if (optlen < sizeof(ling)) {
1086 			ret = -EINVAL;	/* 1003.1g */
1087 			break;
1088 		}
1089 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1090 			ret = -EFAULT;
1091 			break;
1092 		}
1093 		if (!ling.l_onoff)
1094 			sock_reset_flag(sk, SOCK_LINGER);
1095 		else {
1096 #if (BITS_PER_LONG == 32)
1097 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1098 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1099 			else
1100 #endif
1101 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1102 			sock_set_flag(sk, SOCK_LINGER);
1103 		}
1104 		break;
1105 
1106 	case SO_BSDCOMPAT:
1107 		break;
1108 
1109 	case SO_PASSCRED:
1110 		if (valbool)
1111 			set_bit(SOCK_PASSCRED, &sock->flags);
1112 		else
1113 			clear_bit(SOCK_PASSCRED, &sock->flags);
1114 		break;
1115 
1116 	case SO_TIMESTAMP_OLD:
1117 	case SO_TIMESTAMP_NEW:
1118 	case SO_TIMESTAMPNS_OLD:
1119 	case SO_TIMESTAMPNS_NEW:
1120 		sock_set_timestamp(sk, optname, valbool);
1121 		break;
1122 
1123 	case SO_TIMESTAMPING_NEW:
1124 	case SO_TIMESTAMPING_OLD:
1125 		if (optlen == sizeof(timestamping)) {
1126 			if (copy_from_sockptr(&timestamping, optval,
1127 					      sizeof(timestamping))) {
1128 				ret = -EFAULT;
1129 				break;
1130 			}
1131 		} else {
1132 			memset(&timestamping, 0, sizeof(timestamping));
1133 			timestamping.flags = val;
1134 		}
1135 		ret = sock_set_timestamping(sk, optname, timestamping);
1136 		break;
1137 
1138 	case SO_RCVLOWAT:
1139 		if (val < 0)
1140 			val = INT_MAX;
1141 		if (sock->ops->set_rcvlowat)
1142 			ret = sock->ops->set_rcvlowat(sk, val);
1143 		else
1144 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1145 		break;
1146 
1147 	case SO_RCVTIMEO_OLD:
1148 	case SO_RCVTIMEO_NEW:
1149 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1150 				       optlen, optname == SO_RCVTIMEO_OLD);
1151 		break;
1152 
1153 	case SO_SNDTIMEO_OLD:
1154 	case SO_SNDTIMEO_NEW:
1155 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1156 				       optlen, optname == SO_SNDTIMEO_OLD);
1157 		break;
1158 
1159 	case SO_ATTACH_FILTER: {
1160 		struct sock_fprog fprog;
1161 
1162 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1163 		if (!ret)
1164 			ret = sk_attach_filter(&fprog, sk);
1165 		break;
1166 	}
1167 	case SO_ATTACH_BPF:
1168 		ret = -EINVAL;
1169 		if (optlen == sizeof(u32)) {
1170 			u32 ufd;
1171 
1172 			ret = -EFAULT;
1173 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1174 				break;
1175 
1176 			ret = sk_attach_bpf(ufd, sk);
1177 		}
1178 		break;
1179 
1180 	case SO_ATTACH_REUSEPORT_CBPF: {
1181 		struct sock_fprog fprog;
1182 
1183 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1184 		if (!ret)
1185 			ret = sk_reuseport_attach_filter(&fprog, sk);
1186 		break;
1187 	}
1188 	case SO_ATTACH_REUSEPORT_EBPF:
1189 		ret = -EINVAL;
1190 		if (optlen == sizeof(u32)) {
1191 			u32 ufd;
1192 
1193 			ret = -EFAULT;
1194 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1195 				break;
1196 
1197 			ret = sk_reuseport_attach_bpf(ufd, sk);
1198 		}
1199 		break;
1200 
1201 	case SO_DETACH_REUSEPORT_BPF:
1202 		ret = reuseport_detach_prog(sk);
1203 		break;
1204 
1205 	case SO_DETACH_FILTER:
1206 		ret = sk_detach_filter(sk);
1207 		break;
1208 
1209 	case SO_LOCK_FILTER:
1210 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1211 			ret = -EPERM;
1212 		else
1213 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1214 		break;
1215 
1216 	case SO_PASSSEC:
1217 		if (valbool)
1218 			set_bit(SOCK_PASSSEC, &sock->flags);
1219 		else
1220 			clear_bit(SOCK_PASSSEC, &sock->flags);
1221 		break;
1222 	case SO_MARK:
1223 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1224 			ret = -EPERM;
1225 			break;
1226 		}
1227 
1228 		__sock_set_mark(sk, val);
1229 		break;
1230 
1231 	case SO_RXQ_OVFL:
1232 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1233 		break;
1234 
1235 	case SO_WIFI_STATUS:
1236 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1237 		break;
1238 
1239 	case SO_PEEK_OFF:
1240 		if (sock->ops->set_peek_off)
1241 			ret = sock->ops->set_peek_off(sk, val);
1242 		else
1243 			ret = -EOPNOTSUPP;
1244 		break;
1245 
1246 	case SO_NOFCS:
1247 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1248 		break;
1249 
1250 	case SO_SELECT_ERR_QUEUE:
1251 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1252 		break;
1253 
1254 #ifdef CONFIG_NET_RX_BUSY_POLL
1255 	case SO_BUSY_POLL:
1256 		/* allow unprivileged users to decrease the value */
1257 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1258 			ret = -EPERM;
1259 		else {
1260 			if (val < 0)
1261 				ret = -EINVAL;
1262 			else
1263 				WRITE_ONCE(sk->sk_ll_usec, val);
1264 		}
1265 		break;
1266 	case SO_PREFER_BUSY_POLL:
1267 		if (valbool && !capable(CAP_NET_ADMIN))
1268 			ret = -EPERM;
1269 		else
1270 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1271 		break;
1272 	case SO_BUSY_POLL_BUDGET:
1273 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1274 			ret = -EPERM;
1275 		} else {
1276 			if (val < 0 || val > U16_MAX)
1277 				ret = -EINVAL;
1278 			else
1279 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1280 		}
1281 		break;
1282 #endif
1283 
1284 	case SO_MAX_PACING_RATE:
1285 		{
1286 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1287 
1288 		if (sizeof(ulval) != sizeof(val) &&
1289 		    optlen >= sizeof(ulval) &&
1290 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1291 			ret = -EFAULT;
1292 			break;
1293 		}
1294 		if (ulval != ~0UL)
1295 			cmpxchg(&sk->sk_pacing_status,
1296 				SK_PACING_NONE,
1297 				SK_PACING_NEEDED);
1298 		sk->sk_max_pacing_rate = ulval;
1299 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1300 		break;
1301 		}
1302 	case SO_INCOMING_CPU:
1303 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1304 		break;
1305 
1306 	case SO_CNX_ADVICE:
1307 		if (val == 1)
1308 			dst_negative_advice(sk);
1309 		break;
1310 
1311 	case SO_ZEROCOPY:
1312 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1313 			if (!((sk->sk_type == SOCK_STREAM &&
1314 			       sk->sk_protocol == IPPROTO_TCP) ||
1315 			      (sk->sk_type == SOCK_DGRAM &&
1316 			       sk->sk_protocol == IPPROTO_UDP)))
1317 				ret = -ENOTSUPP;
1318 		} else if (sk->sk_family != PF_RDS) {
1319 			ret = -ENOTSUPP;
1320 		}
1321 		if (!ret) {
1322 			if (val < 0 || val > 1)
1323 				ret = -EINVAL;
1324 			else
1325 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1326 		}
1327 		break;
1328 
1329 	case SO_TXTIME:
1330 		if (optlen != sizeof(struct sock_txtime)) {
1331 			ret = -EINVAL;
1332 			break;
1333 		} else if (copy_from_sockptr(&sk_txtime, optval,
1334 			   sizeof(struct sock_txtime))) {
1335 			ret = -EFAULT;
1336 			break;
1337 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1338 			ret = -EINVAL;
1339 			break;
1340 		}
1341 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1342 		 * scheduler has enough safe guards.
1343 		 */
1344 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1345 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1346 			ret = -EPERM;
1347 			break;
1348 		}
1349 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1350 		sk->sk_clockid = sk_txtime.clockid;
1351 		sk->sk_txtime_deadline_mode =
1352 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1353 		sk->sk_txtime_report_errors =
1354 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1355 		break;
1356 
1357 	case SO_BINDTOIFINDEX:
1358 		ret = sock_bindtoindex_locked(sk, val);
1359 		break;
1360 
1361 	case SO_BUF_LOCK:
1362 		if (val & ~SOCK_BUF_LOCK_MASK) {
1363 			ret = -EINVAL;
1364 			break;
1365 		}
1366 		sk->sk_userlocks = val | (sk->sk_userlocks &
1367 					  ~SOCK_BUF_LOCK_MASK);
1368 		break;
1369 
1370 	default:
1371 		ret = -ENOPROTOOPT;
1372 		break;
1373 	}
1374 	release_sock(sk);
1375 	return ret;
1376 }
1377 EXPORT_SYMBOL(sock_setsockopt);
1378 
1379 
1380 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1381 			  struct ucred *ucred)
1382 {
1383 	ucred->pid = pid_vnr(pid);
1384 	ucred->uid = ucred->gid = -1;
1385 	if (cred) {
1386 		struct user_namespace *current_ns = current_user_ns();
1387 
1388 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1389 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1390 	}
1391 }
1392 
1393 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1394 {
1395 	struct user_namespace *user_ns = current_user_ns();
1396 	int i;
1397 
1398 	for (i = 0; i < src->ngroups; i++)
1399 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1400 			return -EFAULT;
1401 
1402 	return 0;
1403 }
1404 
1405 int sock_getsockopt(struct socket *sock, int level, int optname,
1406 		    char __user *optval, int __user *optlen)
1407 {
1408 	struct sock *sk = sock->sk;
1409 
1410 	union {
1411 		int val;
1412 		u64 val64;
1413 		unsigned long ulval;
1414 		struct linger ling;
1415 		struct old_timeval32 tm32;
1416 		struct __kernel_old_timeval tm;
1417 		struct  __kernel_sock_timeval stm;
1418 		struct sock_txtime txtime;
1419 		struct so_timestamping timestamping;
1420 	} v;
1421 
1422 	int lv = sizeof(int);
1423 	int len;
1424 
1425 	if (get_user(len, optlen))
1426 		return -EFAULT;
1427 	if (len < 0)
1428 		return -EINVAL;
1429 
1430 	memset(&v, 0, sizeof(v));
1431 
1432 	switch (optname) {
1433 	case SO_DEBUG:
1434 		v.val = sock_flag(sk, SOCK_DBG);
1435 		break;
1436 
1437 	case SO_DONTROUTE:
1438 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1439 		break;
1440 
1441 	case SO_BROADCAST:
1442 		v.val = sock_flag(sk, SOCK_BROADCAST);
1443 		break;
1444 
1445 	case SO_SNDBUF:
1446 		v.val = sk->sk_sndbuf;
1447 		break;
1448 
1449 	case SO_RCVBUF:
1450 		v.val = sk->sk_rcvbuf;
1451 		break;
1452 
1453 	case SO_REUSEADDR:
1454 		v.val = sk->sk_reuse;
1455 		break;
1456 
1457 	case SO_REUSEPORT:
1458 		v.val = sk->sk_reuseport;
1459 		break;
1460 
1461 	case SO_KEEPALIVE:
1462 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1463 		break;
1464 
1465 	case SO_TYPE:
1466 		v.val = sk->sk_type;
1467 		break;
1468 
1469 	case SO_PROTOCOL:
1470 		v.val = sk->sk_protocol;
1471 		break;
1472 
1473 	case SO_DOMAIN:
1474 		v.val = sk->sk_family;
1475 		break;
1476 
1477 	case SO_ERROR:
1478 		v.val = -sock_error(sk);
1479 		if (v.val == 0)
1480 			v.val = xchg(&sk->sk_err_soft, 0);
1481 		break;
1482 
1483 	case SO_OOBINLINE:
1484 		v.val = sock_flag(sk, SOCK_URGINLINE);
1485 		break;
1486 
1487 	case SO_NO_CHECK:
1488 		v.val = sk->sk_no_check_tx;
1489 		break;
1490 
1491 	case SO_PRIORITY:
1492 		v.val = sk->sk_priority;
1493 		break;
1494 
1495 	case SO_LINGER:
1496 		lv		= sizeof(v.ling);
1497 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1498 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1499 		break;
1500 
1501 	case SO_BSDCOMPAT:
1502 		break;
1503 
1504 	case SO_TIMESTAMP_OLD:
1505 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1506 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1507 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1508 		break;
1509 
1510 	case SO_TIMESTAMPNS_OLD:
1511 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1512 		break;
1513 
1514 	case SO_TIMESTAMP_NEW:
1515 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1516 		break;
1517 
1518 	case SO_TIMESTAMPNS_NEW:
1519 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1520 		break;
1521 
1522 	case SO_TIMESTAMPING_OLD:
1523 		lv = sizeof(v.timestamping);
1524 		v.timestamping.flags = sk->sk_tsflags;
1525 		v.timestamping.bind_phc = sk->sk_bind_phc;
1526 		break;
1527 
1528 	case SO_RCVTIMEO_OLD:
1529 	case SO_RCVTIMEO_NEW:
1530 		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1531 		break;
1532 
1533 	case SO_SNDTIMEO_OLD:
1534 	case SO_SNDTIMEO_NEW:
1535 		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1536 		break;
1537 
1538 	case SO_RCVLOWAT:
1539 		v.val = sk->sk_rcvlowat;
1540 		break;
1541 
1542 	case SO_SNDLOWAT:
1543 		v.val = 1;
1544 		break;
1545 
1546 	case SO_PASSCRED:
1547 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1548 		break;
1549 
1550 	case SO_PEERCRED:
1551 	{
1552 		struct ucred peercred;
1553 		if (len > sizeof(peercred))
1554 			len = sizeof(peercred);
1555 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1556 		if (copy_to_user(optval, &peercred, len))
1557 			return -EFAULT;
1558 		goto lenout;
1559 	}
1560 
1561 	case SO_PEERGROUPS:
1562 	{
1563 		int ret, n;
1564 
1565 		if (!sk->sk_peer_cred)
1566 			return -ENODATA;
1567 
1568 		n = sk->sk_peer_cred->group_info->ngroups;
1569 		if (len < n * sizeof(gid_t)) {
1570 			len = n * sizeof(gid_t);
1571 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1572 		}
1573 		len = n * sizeof(gid_t);
1574 
1575 		ret = groups_to_user((gid_t __user *)optval,
1576 				     sk->sk_peer_cred->group_info);
1577 		if (ret)
1578 			return ret;
1579 		goto lenout;
1580 	}
1581 
1582 	case SO_PEERNAME:
1583 	{
1584 		char address[128];
1585 
1586 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1587 		if (lv < 0)
1588 			return -ENOTCONN;
1589 		if (lv < len)
1590 			return -EINVAL;
1591 		if (copy_to_user(optval, address, len))
1592 			return -EFAULT;
1593 		goto lenout;
1594 	}
1595 
1596 	/* Dubious BSD thing... Probably nobody even uses it, but
1597 	 * the UNIX standard wants it for whatever reason... -DaveM
1598 	 */
1599 	case SO_ACCEPTCONN:
1600 		v.val = sk->sk_state == TCP_LISTEN;
1601 		break;
1602 
1603 	case SO_PASSSEC:
1604 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1605 		break;
1606 
1607 	case SO_PEERSEC:
1608 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1609 
1610 	case SO_MARK:
1611 		v.val = sk->sk_mark;
1612 		break;
1613 
1614 	case SO_RXQ_OVFL:
1615 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1616 		break;
1617 
1618 	case SO_WIFI_STATUS:
1619 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1620 		break;
1621 
1622 	case SO_PEEK_OFF:
1623 		if (!sock->ops->set_peek_off)
1624 			return -EOPNOTSUPP;
1625 
1626 		v.val = sk->sk_peek_off;
1627 		break;
1628 	case SO_NOFCS:
1629 		v.val = sock_flag(sk, SOCK_NOFCS);
1630 		break;
1631 
1632 	case SO_BINDTODEVICE:
1633 		return sock_getbindtodevice(sk, optval, optlen, len);
1634 
1635 	case SO_GET_FILTER:
1636 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1637 		if (len < 0)
1638 			return len;
1639 
1640 		goto lenout;
1641 
1642 	case SO_LOCK_FILTER:
1643 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1644 		break;
1645 
1646 	case SO_BPF_EXTENSIONS:
1647 		v.val = bpf_tell_extensions();
1648 		break;
1649 
1650 	case SO_SELECT_ERR_QUEUE:
1651 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1652 		break;
1653 
1654 #ifdef CONFIG_NET_RX_BUSY_POLL
1655 	case SO_BUSY_POLL:
1656 		v.val = sk->sk_ll_usec;
1657 		break;
1658 	case SO_PREFER_BUSY_POLL:
1659 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1660 		break;
1661 #endif
1662 
1663 	case SO_MAX_PACING_RATE:
1664 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1665 			lv = sizeof(v.ulval);
1666 			v.ulval = sk->sk_max_pacing_rate;
1667 		} else {
1668 			/* 32bit version */
1669 			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1670 		}
1671 		break;
1672 
1673 	case SO_INCOMING_CPU:
1674 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1675 		break;
1676 
1677 	case SO_MEMINFO:
1678 	{
1679 		u32 meminfo[SK_MEMINFO_VARS];
1680 
1681 		sk_get_meminfo(sk, meminfo);
1682 
1683 		len = min_t(unsigned int, len, sizeof(meminfo));
1684 		if (copy_to_user(optval, &meminfo, len))
1685 			return -EFAULT;
1686 
1687 		goto lenout;
1688 	}
1689 
1690 #ifdef CONFIG_NET_RX_BUSY_POLL
1691 	case SO_INCOMING_NAPI_ID:
1692 		v.val = READ_ONCE(sk->sk_napi_id);
1693 
1694 		/* aggregate non-NAPI IDs down to 0 */
1695 		if (v.val < MIN_NAPI_ID)
1696 			v.val = 0;
1697 
1698 		break;
1699 #endif
1700 
1701 	case SO_COOKIE:
1702 		lv = sizeof(u64);
1703 		if (len < lv)
1704 			return -EINVAL;
1705 		v.val64 = sock_gen_cookie(sk);
1706 		break;
1707 
1708 	case SO_ZEROCOPY:
1709 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1710 		break;
1711 
1712 	case SO_TXTIME:
1713 		lv = sizeof(v.txtime);
1714 		v.txtime.clockid = sk->sk_clockid;
1715 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1716 				  SOF_TXTIME_DEADLINE_MODE : 0;
1717 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1718 				  SOF_TXTIME_REPORT_ERRORS : 0;
1719 		break;
1720 
1721 	case SO_BINDTOIFINDEX:
1722 		v.val = sk->sk_bound_dev_if;
1723 		break;
1724 
1725 	case SO_NETNS_COOKIE:
1726 		lv = sizeof(u64);
1727 		if (len != lv)
1728 			return -EINVAL;
1729 		v.val64 = sock_net(sk)->net_cookie;
1730 		break;
1731 
1732 	case SO_BUF_LOCK:
1733 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1734 		break;
1735 
1736 	default:
1737 		/* We implement the SO_SNDLOWAT etc to not be settable
1738 		 * (1003.1g 7).
1739 		 */
1740 		return -ENOPROTOOPT;
1741 	}
1742 
1743 	if (len > lv)
1744 		len = lv;
1745 	if (copy_to_user(optval, &v, len))
1746 		return -EFAULT;
1747 lenout:
1748 	if (put_user(len, optlen))
1749 		return -EFAULT;
1750 	return 0;
1751 }
1752 
1753 /*
1754  * Initialize an sk_lock.
1755  *
1756  * (We also register the sk_lock with the lock validator.)
1757  */
1758 static inline void sock_lock_init(struct sock *sk)
1759 {
1760 	if (sk->sk_kern_sock)
1761 		sock_lock_init_class_and_name(
1762 			sk,
1763 			af_family_kern_slock_key_strings[sk->sk_family],
1764 			af_family_kern_slock_keys + sk->sk_family,
1765 			af_family_kern_key_strings[sk->sk_family],
1766 			af_family_kern_keys + sk->sk_family);
1767 	else
1768 		sock_lock_init_class_and_name(
1769 			sk,
1770 			af_family_slock_key_strings[sk->sk_family],
1771 			af_family_slock_keys + sk->sk_family,
1772 			af_family_key_strings[sk->sk_family],
1773 			af_family_keys + sk->sk_family);
1774 }
1775 
1776 /*
1777  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1778  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1779  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1780  */
1781 static void sock_copy(struct sock *nsk, const struct sock *osk)
1782 {
1783 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1784 #ifdef CONFIG_SECURITY_NETWORK
1785 	void *sptr = nsk->sk_security;
1786 #endif
1787 
1788 	/* If we move sk_tx_queue_mapping out of the private section,
1789 	 * we must check if sk_tx_queue_clear() is called after
1790 	 * sock_copy() in sk_clone_lock().
1791 	 */
1792 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1793 		     offsetof(struct sock, sk_dontcopy_begin) ||
1794 		     offsetof(struct sock, sk_tx_queue_mapping) >=
1795 		     offsetof(struct sock, sk_dontcopy_end));
1796 
1797 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1798 
1799 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1800 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1801 
1802 #ifdef CONFIG_SECURITY_NETWORK
1803 	nsk->sk_security = sptr;
1804 	security_sk_clone(osk, nsk);
1805 #endif
1806 }
1807 
1808 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1809 		int family)
1810 {
1811 	struct sock *sk;
1812 	struct kmem_cache *slab;
1813 
1814 	slab = prot->slab;
1815 	if (slab != NULL) {
1816 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1817 		if (!sk)
1818 			return sk;
1819 		if (want_init_on_alloc(priority))
1820 			sk_prot_clear_nulls(sk, prot->obj_size);
1821 	} else
1822 		sk = kmalloc(prot->obj_size, priority);
1823 
1824 	if (sk != NULL) {
1825 		if (security_sk_alloc(sk, family, priority))
1826 			goto out_free;
1827 
1828 		if (!try_module_get(prot->owner))
1829 			goto out_free_sec;
1830 	}
1831 
1832 	return sk;
1833 
1834 out_free_sec:
1835 	security_sk_free(sk);
1836 out_free:
1837 	if (slab != NULL)
1838 		kmem_cache_free(slab, sk);
1839 	else
1840 		kfree(sk);
1841 	return NULL;
1842 }
1843 
1844 static void sk_prot_free(struct proto *prot, struct sock *sk)
1845 {
1846 	struct kmem_cache *slab;
1847 	struct module *owner;
1848 
1849 	owner = prot->owner;
1850 	slab = prot->slab;
1851 
1852 	cgroup_sk_free(&sk->sk_cgrp_data);
1853 	mem_cgroup_sk_free(sk);
1854 	security_sk_free(sk);
1855 	if (slab != NULL)
1856 		kmem_cache_free(slab, sk);
1857 	else
1858 		kfree(sk);
1859 	module_put(owner);
1860 }
1861 
1862 /**
1863  *	sk_alloc - All socket objects are allocated here
1864  *	@net: the applicable net namespace
1865  *	@family: protocol family
1866  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1867  *	@prot: struct proto associated with this new sock instance
1868  *	@kern: is this to be a kernel socket?
1869  */
1870 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1871 		      struct proto *prot, int kern)
1872 {
1873 	struct sock *sk;
1874 
1875 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1876 	if (sk) {
1877 		sk->sk_family = family;
1878 		/*
1879 		 * See comment in struct sock definition to understand
1880 		 * why we need sk_prot_creator -acme
1881 		 */
1882 		sk->sk_prot = sk->sk_prot_creator = prot;
1883 		sk->sk_kern_sock = kern;
1884 		sock_lock_init(sk);
1885 		sk->sk_net_refcnt = kern ? 0 : 1;
1886 		if (likely(sk->sk_net_refcnt)) {
1887 			get_net(net);
1888 			sock_inuse_add(net, 1);
1889 		}
1890 
1891 		sock_net_set(sk, net);
1892 		refcount_set(&sk->sk_wmem_alloc, 1);
1893 
1894 		mem_cgroup_sk_alloc(sk);
1895 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1896 		sock_update_classid(&sk->sk_cgrp_data);
1897 		sock_update_netprioidx(&sk->sk_cgrp_data);
1898 		sk_tx_queue_clear(sk);
1899 	}
1900 
1901 	return sk;
1902 }
1903 EXPORT_SYMBOL(sk_alloc);
1904 
1905 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1906  * grace period. This is the case for UDP sockets and TCP listeners.
1907  */
1908 static void __sk_destruct(struct rcu_head *head)
1909 {
1910 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1911 	struct sk_filter *filter;
1912 
1913 	if (sk->sk_destruct)
1914 		sk->sk_destruct(sk);
1915 
1916 	filter = rcu_dereference_check(sk->sk_filter,
1917 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1918 	if (filter) {
1919 		sk_filter_uncharge(sk, filter);
1920 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1921 	}
1922 
1923 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1924 
1925 #ifdef CONFIG_BPF_SYSCALL
1926 	bpf_sk_storage_free(sk);
1927 #endif
1928 
1929 	if (atomic_read(&sk->sk_omem_alloc))
1930 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1931 			 __func__, atomic_read(&sk->sk_omem_alloc));
1932 
1933 	if (sk->sk_frag.page) {
1934 		put_page(sk->sk_frag.page);
1935 		sk->sk_frag.page = NULL;
1936 	}
1937 
1938 	if (sk->sk_peer_cred)
1939 		put_cred(sk->sk_peer_cred);
1940 	put_pid(sk->sk_peer_pid);
1941 	if (likely(sk->sk_net_refcnt))
1942 		put_net(sock_net(sk));
1943 	sk_prot_free(sk->sk_prot_creator, sk);
1944 }
1945 
1946 void sk_destruct(struct sock *sk)
1947 {
1948 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1949 
1950 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1951 		reuseport_detach_sock(sk);
1952 		use_call_rcu = true;
1953 	}
1954 
1955 	if (use_call_rcu)
1956 		call_rcu(&sk->sk_rcu, __sk_destruct);
1957 	else
1958 		__sk_destruct(&sk->sk_rcu);
1959 }
1960 
1961 static void __sk_free(struct sock *sk)
1962 {
1963 	if (likely(sk->sk_net_refcnt))
1964 		sock_inuse_add(sock_net(sk), -1);
1965 
1966 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1967 		sock_diag_broadcast_destroy(sk);
1968 	else
1969 		sk_destruct(sk);
1970 }
1971 
1972 void sk_free(struct sock *sk)
1973 {
1974 	/*
1975 	 * We subtract one from sk_wmem_alloc and can know if
1976 	 * some packets are still in some tx queue.
1977 	 * If not null, sock_wfree() will call __sk_free(sk) later
1978 	 */
1979 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1980 		__sk_free(sk);
1981 }
1982 EXPORT_SYMBOL(sk_free);
1983 
1984 static void sk_init_common(struct sock *sk)
1985 {
1986 	skb_queue_head_init(&sk->sk_receive_queue);
1987 	skb_queue_head_init(&sk->sk_write_queue);
1988 	skb_queue_head_init(&sk->sk_error_queue);
1989 
1990 	rwlock_init(&sk->sk_callback_lock);
1991 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1992 			af_rlock_keys + sk->sk_family,
1993 			af_family_rlock_key_strings[sk->sk_family]);
1994 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1995 			af_wlock_keys + sk->sk_family,
1996 			af_family_wlock_key_strings[sk->sk_family]);
1997 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1998 			af_elock_keys + sk->sk_family,
1999 			af_family_elock_key_strings[sk->sk_family]);
2000 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2001 			af_callback_keys + sk->sk_family,
2002 			af_family_clock_key_strings[sk->sk_family]);
2003 }
2004 
2005 /**
2006  *	sk_clone_lock - clone a socket, and lock its clone
2007  *	@sk: the socket to clone
2008  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2009  *
2010  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2011  */
2012 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2013 {
2014 	struct proto *prot = READ_ONCE(sk->sk_prot);
2015 	struct sk_filter *filter;
2016 	bool is_charged = true;
2017 	struct sock *newsk;
2018 
2019 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2020 	if (!newsk)
2021 		goto out;
2022 
2023 	sock_copy(newsk, sk);
2024 
2025 	newsk->sk_prot_creator = prot;
2026 
2027 	/* SANITY */
2028 	if (likely(newsk->sk_net_refcnt))
2029 		get_net(sock_net(newsk));
2030 	sk_node_init(&newsk->sk_node);
2031 	sock_lock_init(newsk);
2032 	bh_lock_sock(newsk);
2033 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2034 	newsk->sk_backlog.len = 0;
2035 
2036 	atomic_set(&newsk->sk_rmem_alloc, 0);
2037 
2038 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2039 	refcount_set(&newsk->sk_wmem_alloc, 1);
2040 
2041 	atomic_set(&newsk->sk_omem_alloc, 0);
2042 	sk_init_common(newsk);
2043 
2044 	newsk->sk_dst_cache	= NULL;
2045 	newsk->sk_dst_pending_confirm = 0;
2046 	newsk->sk_wmem_queued	= 0;
2047 	newsk->sk_forward_alloc = 0;
2048 	atomic_set(&newsk->sk_drops, 0);
2049 	newsk->sk_send_head	= NULL;
2050 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2051 	atomic_set(&newsk->sk_zckey, 0);
2052 
2053 	sock_reset_flag(newsk, SOCK_DONE);
2054 
2055 	/* sk->sk_memcg will be populated at accept() time */
2056 	newsk->sk_memcg = NULL;
2057 
2058 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2059 
2060 	rcu_read_lock();
2061 	filter = rcu_dereference(sk->sk_filter);
2062 	if (filter != NULL)
2063 		/* though it's an empty new sock, the charging may fail
2064 		 * if sysctl_optmem_max was changed between creation of
2065 		 * original socket and cloning
2066 		 */
2067 		is_charged = sk_filter_charge(newsk, filter);
2068 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2069 	rcu_read_unlock();
2070 
2071 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2072 		/* We need to make sure that we don't uncharge the new
2073 		 * socket if we couldn't charge it in the first place
2074 		 * as otherwise we uncharge the parent's filter.
2075 		 */
2076 		if (!is_charged)
2077 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2078 		sk_free_unlock_clone(newsk);
2079 		newsk = NULL;
2080 		goto out;
2081 	}
2082 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2083 
2084 	if (bpf_sk_storage_clone(sk, newsk)) {
2085 		sk_free_unlock_clone(newsk);
2086 		newsk = NULL;
2087 		goto out;
2088 	}
2089 
2090 	/* Clear sk_user_data if parent had the pointer tagged
2091 	 * as not suitable for copying when cloning.
2092 	 */
2093 	if (sk_user_data_is_nocopy(newsk))
2094 		newsk->sk_user_data = NULL;
2095 
2096 	newsk->sk_err	   = 0;
2097 	newsk->sk_err_soft = 0;
2098 	newsk->sk_priority = 0;
2099 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2100 	if (likely(newsk->sk_net_refcnt))
2101 		sock_inuse_add(sock_net(newsk), 1);
2102 
2103 	/* Before updating sk_refcnt, we must commit prior changes to memory
2104 	 * (Documentation/RCU/rculist_nulls.rst for details)
2105 	 */
2106 	smp_wmb();
2107 	refcount_set(&newsk->sk_refcnt, 2);
2108 
2109 	/* Increment the counter in the same struct proto as the master
2110 	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2111 	 * is the same as sk->sk_prot->socks, as this field was copied
2112 	 * with memcpy).
2113 	 *
2114 	 * This _changes_ the previous behaviour, where
2115 	 * tcp_create_openreq_child always was incrementing the
2116 	 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2117 	 * to be taken into account in all callers. -acme
2118 	 */
2119 	sk_refcnt_debug_inc(newsk);
2120 	sk_set_socket(newsk, NULL);
2121 	sk_tx_queue_clear(newsk);
2122 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2123 
2124 	if (newsk->sk_prot->sockets_allocated)
2125 		sk_sockets_allocated_inc(newsk);
2126 
2127 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2128 		net_enable_timestamp();
2129 out:
2130 	return newsk;
2131 }
2132 EXPORT_SYMBOL_GPL(sk_clone_lock);
2133 
2134 void sk_free_unlock_clone(struct sock *sk)
2135 {
2136 	/* It is still raw copy of parent, so invalidate
2137 	 * destructor and make plain sk_free() */
2138 	sk->sk_destruct = NULL;
2139 	bh_unlock_sock(sk);
2140 	sk_free(sk);
2141 }
2142 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2143 
2144 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2145 {
2146 	u32 max_segs = 1;
2147 
2148 	sk_dst_set(sk, dst);
2149 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
2150 	if (sk->sk_route_caps & NETIF_F_GSO)
2151 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2152 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
2153 	if (sk_can_gso(sk)) {
2154 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2155 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2156 		} else {
2157 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2158 			sk->sk_gso_max_size = dst->dev->gso_max_size;
2159 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
2160 		}
2161 	}
2162 	sk->sk_gso_max_segs = max_segs;
2163 }
2164 EXPORT_SYMBOL_GPL(sk_setup_caps);
2165 
2166 /*
2167  *	Simple resource managers for sockets.
2168  */
2169 
2170 
2171 /*
2172  * Write buffer destructor automatically called from kfree_skb.
2173  */
2174 void sock_wfree(struct sk_buff *skb)
2175 {
2176 	struct sock *sk = skb->sk;
2177 	unsigned int len = skb->truesize;
2178 
2179 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2180 		/*
2181 		 * Keep a reference on sk_wmem_alloc, this will be released
2182 		 * after sk_write_space() call
2183 		 */
2184 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2185 		sk->sk_write_space(sk);
2186 		len = 1;
2187 	}
2188 	/*
2189 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2190 	 * could not do because of in-flight packets
2191 	 */
2192 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2193 		__sk_free(sk);
2194 }
2195 EXPORT_SYMBOL(sock_wfree);
2196 
2197 /* This variant of sock_wfree() is used by TCP,
2198  * since it sets SOCK_USE_WRITE_QUEUE.
2199  */
2200 void __sock_wfree(struct sk_buff *skb)
2201 {
2202 	struct sock *sk = skb->sk;
2203 
2204 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2205 		__sk_free(sk);
2206 }
2207 
2208 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2209 {
2210 	skb_orphan(skb);
2211 	skb->sk = sk;
2212 #ifdef CONFIG_INET
2213 	if (unlikely(!sk_fullsock(sk))) {
2214 		skb->destructor = sock_edemux;
2215 		sock_hold(sk);
2216 		return;
2217 	}
2218 #endif
2219 	skb->destructor = sock_wfree;
2220 	skb_set_hash_from_sk(skb, sk);
2221 	/*
2222 	 * We used to take a refcount on sk, but following operation
2223 	 * is enough to guarantee sk_free() wont free this sock until
2224 	 * all in-flight packets are completed
2225 	 */
2226 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2227 }
2228 EXPORT_SYMBOL(skb_set_owner_w);
2229 
2230 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2231 {
2232 #ifdef CONFIG_TLS_DEVICE
2233 	/* Drivers depend on in-order delivery for crypto offload,
2234 	 * partial orphan breaks out-of-order-OK logic.
2235 	 */
2236 	if (skb->decrypted)
2237 		return false;
2238 #endif
2239 	return (skb->destructor == sock_wfree ||
2240 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2241 }
2242 
2243 /* This helper is used by netem, as it can hold packets in its
2244  * delay queue. We want to allow the owner socket to send more
2245  * packets, as if they were already TX completed by a typical driver.
2246  * But we also want to keep skb->sk set because some packet schedulers
2247  * rely on it (sch_fq for example).
2248  */
2249 void skb_orphan_partial(struct sk_buff *skb)
2250 {
2251 	if (skb_is_tcp_pure_ack(skb))
2252 		return;
2253 
2254 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2255 		return;
2256 
2257 	skb_orphan(skb);
2258 }
2259 EXPORT_SYMBOL(skb_orphan_partial);
2260 
2261 /*
2262  * Read buffer destructor automatically called from kfree_skb.
2263  */
2264 void sock_rfree(struct sk_buff *skb)
2265 {
2266 	struct sock *sk = skb->sk;
2267 	unsigned int len = skb->truesize;
2268 
2269 	atomic_sub(len, &sk->sk_rmem_alloc);
2270 	sk_mem_uncharge(sk, len);
2271 }
2272 EXPORT_SYMBOL(sock_rfree);
2273 
2274 /*
2275  * Buffer destructor for skbs that are not used directly in read or write
2276  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2277  */
2278 void sock_efree(struct sk_buff *skb)
2279 {
2280 	sock_put(skb->sk);
2281 }
2282 EXPORT_SYMBOL(sock_efree);
2283 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 *
 * The socket is released with sock_gen_put() only when
 * sk_is_refcounted() reports that a reference is held.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	if (!sk_is_refcounted(sk))
		return;

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2295 
2296 kuid_t sock_i_uid(struct sock *sk)
2297 {
2298 	kuid_t uid;
2299 
2300 	read_lock_bh(&sk->sk_callback_lock);
2301 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2302 	read_unlock_bh(&sk->sk_callback_lock);
2303 	return uid;
2304 }
2305 EXPORT_SYMBOL(sock_i_uid);
2306 
2307 unsigned long sock_i_ino(struct sock *sk)
2308 {
2309 	unsigned long ino;
2310 
2311 	read_lock_bh(&sk->sk_callback_lock);
2312 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2313 	read_unlock_bh(&sk->sk_callback_lock);
2314 	return ino;
2315 }
2316 EXPORT_SYMBOL(sock_i_ino);
2317 
2318 /*
2319  * Allocate a skb from the socket's send buffer.
2320  */
2321 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2322 			     gfp_t priority)
2323 {
2324 	if (force ||
2325 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2326 		struct sk_buff *skb = alloc_skb(size, priority);
2327 
2328 		if (skb) {
2329 			skb_set_owner_w(skb, sk);
2330 			return skb;
2331 		}
2332 	}
2333 	return NULL;
2334 }
2335 EXPORT_SYMBOL(sock_wmalloc);
2336 
2337 static void sock_ofree(struct sk_buff *skb)
2338 {
2339 	struct sock *sk = skb->sk;
2340 
2341 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2342 }
2343 
2344 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2345 			     gfp_t priority)
2346 {
2347 	struct sk_buff *skb;
2348 
2349 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2350 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2351 	    sysctl_optmem_max)
2352 		return NULL;
2353 
2354 	skb = alloc_skb(size, priority);
2355 	if (!skb)
2356 		return NULL;
2357 
2358 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2359 	skb->sk = sk;
2360 	skb->destructor = sock_ofree;
2361 	return skb;
2362 }
2363 
2364 /*
2365  * Allocate a memory block from the socket's option memory buffer.
2366  */
2367 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2368 {
2369 	if ((unsigned int)size <= sysctl_optmem_max &&
2370 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2371 		void *mem;
2372 		/* First do the add, to avoid the race if kmalloc
2373 		 * might sleep.
2374 		 */
2375 		atomic_add(size, &sk->sk_omem_alloc);
2376 		mem = kmalloc(size, priority);
2377 		if (mem)
2378 			return mem;
2379 		atomic_sub(size, &sk->sk_omem_alloc);
2380 	}
2381 	return NULL;
2382 }
2383 EXPORT_SYMBOL(sock_kmalloc);
2384 
2385 /* Free an option memory block. Note, we actually want the inline
2386  * here as this allows gcc to detect the nullify and fold away the
2387  * condition entirely.
2388  */
2389 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2390 				  const bool nullify)
2391 {
2392 	if (WARN_ON_ONCE(!mem))
2393 		return;
2394 	if (nullify)
2395 		kfree_sensitive(mem);
2396 	else
2397 		kfree(mem);
2398 	atomic_sub(size, &sk->sk_omem_alloc);
2399 }
2400 
2401 void sock_kfree_s(struct sock *sk, void *mem, int size)
2402 {
2403 	__sock_kfree_s(sk, mem, size, false);
2404 }
2405 EXPORT_SYMBOL(sock_kfree_s);
2406 
2407 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2408 {
2409 	__sock_kfree_s(sk, mem, size, true);
2410 }
2411 EXPORT_SYMBOL(sock_kzfree_s);
2412 
2413 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2414    I think, these locks should be removed for datagram sockets.
2415  */
2416 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2417 {
2418 	DEFINE_WAIT(wait);
2419 
2420 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2421 	for (;;) {
2422 		if (!timeo)
2423 			break;
2424 		if (signal_pending(current))
2425 			break;
2426 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2427 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2428 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2429 			break;
2430 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2431 			break;
2432 		if (sk->sk_err)
2433 			break;
2434 		timeo = schedule_timeout(timeo);
2435 	}
2436 	finish_wait(sk_sleep(sk), &wait);
2437 	return timeo;
2438 }
2439 
2440 
2441 /*
2442  *	Generic send/receive buffer handlers
2443  */
2444 
2445 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2446 				     unsigned long data_len, int noblock,
2447 				     int *errcode, int max_page_order)
2448 {
2449 	struct sk_buff *skb;
2450 	long timeo;
2451 	int err;
2452 
2453 	timeo = sock_sndtimeo(sk, noblock);
2454 	for (;;) {
2455 		err = sock_error(sk);
2456 		if (err != 0)
2457 			goto failure;
2458 
2459 		err = -EPIPE;
2460 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2461 			goto failure;
2462 
2463 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2464 			break;
2465 
2466 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2467 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2468 		err = -EAGAIN;
2469 		if (!timeo)
2470 			goto failure;
2471 		if (signal_pending(current))
2472 			goto interrupted;
2473 		timeo = sock_wait_for_wmem(sk, timeo);
2474 	}
2475 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2476 				   errcode, sk->sk_allocation);
2477 	if (skb)
2478 		skb_set_owner_w(skb, sk);
2479 	return skb;
2480 
2481 interrupted:
2482 	err = sock_intr_errno(timeo);
2483 failure:
2484 	*errcode = err;
2485 	return NULL;
2486 }
2487 EXPORT_SYMBOL(sock_alloc_send_pskb);
2488 
/*
 * Linear-only form of sock_alloc_send_pskb(): @size bytes of header,
 * no paged data and a maximum page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2494 EXPORT_SYMBOL(sock_alloc_send_skb);
2495 
2496 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2497 		     struct sockcm_cookie *sockc)
2498 {
2499 	u32 tsflags;
2500 
2501 	switch (cmsg->cmsg_type) {
2502 	case SO_MARK:
2503 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2504 			return -EPERM;
2505 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2506 			return -EINVAL;
2507 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2508 		break;
2509 	case SO_TIMESTAMPING_OLD:
2510 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2511 			return -EINVAL;
2512 
2513 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2514 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2515 			return -EINVAL;
2516 
2517 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2518 		sockc->tsflags |= tsflags;
2519 		break;
2520 	case SCM_TXTIME:
2521 		if (!sock_flag(sk, SOCK_TXTIME))
2522 			return -EINVAL;
2523 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2524 			return -EINVAL;
2525 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2526 		break;
2527 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2528 	case SCM_RIGHTS:
2529 	case SCM_CREDENTIALS:
2530 		break;
2531 	default:
2532 		return -EINVAL;
2533 	}
2534 	return 0;
2535 }
2536 EXPORT_SYMBOL(__sock_cmsg_send);
2537 
2538 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2539 		   struct sockcm_cookie *sockc)
2540 {
2541 	struct cmsghdr *cmsg;
2542 	int ret;
2543 
2544 	for_each_cmsghdr(cmsg, msg) {
2545 		if (!CMSG_OK(msg, cmsg))
2546 			return -EINVAL;
2547 		if (cmsg->cmsg_level != SOL_SOCKET)
2548 			continue;
2549 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2550 		if (ret)
2551 			return ret;
2552 	}
2553 	return 0;
2554 }
2555 EXPORT_SYMBOL(sock_cmsg_send);
2556 
2557 static void sk_enter_memory_pressure(struct sock *sk)
2558 {
2559 	if (!sk->sk_prot->enter_memory_pressure)
2560 		return;
2561 
2562 	sk->sk_prot->enter_memory_pressure(sk);
2563 }
2564 
2565 static void sk_leave_memory_pressure(struct sock *sk)
2566 {
2567 	if (sk->sk_prot->leave_memory_pressure) {
2568 		sk->sk_prot->leave_memory_pressure(sk);
2569 	} else {
2570 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2571 
2572 		if (memory_pressure && READ_ONCE(*memory_pressure))
2573 			WRITE_ONCE(*memory_pressure, 0);
2574 	}
2575 }
2576 
2577 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2578 
2579 /**
2580  * skb_page_frag_refill - check that a page_frag contains enough room
2581  * @sz: minimum size of the fragment we want to get
2582  * @pfrag: pointer to page_frag
2583  * @gfp: priority for memory allocation
2584  *
2585  * Note: While this allocator tries to use high order pages, there is
2586  * no guarantee that allocations succeed. Therefore, @sz MUST be
2587  * less or equal than PAGE_SIZE.
2588  */
2589 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2590 {
2591 	if (pfrag->page) {
2592 		if (page_ref_count(pfrag->page) == 1) {
2593 			pfrag->offset = 0;
2594 			return true;
2595 		}
2596 		if (pfrag->offset + sz <= pfrag->size)
2597 			return true;
2598 		put_page(pfrag->page);
2599 	}
2600 
2601 	pfrag->offset = 0;
2602 	if (SKB_FRAG_PAGE_ORDER &&
2603 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2604 		/* Avoid direct reclaim but allow kswapd to wake */
2605 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2606 					  __GFP_COMP | __GFP_NOWARN |
2607 					  __GFP_NORETRY,
2608 					  SKB_FRAG_PAGE_ORDER);
2609 		if (likely(pfrag->page)) {
2610 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2611 			return true;
2612 		}
2613 	}
2614 	pfrag->page = alloc_page(gfp);
2615 	if (likely(pfrag->page)) {
2616 		pfrag->size = PAGE_SIZE;
2617 		return true;
2618 	}
2619 	return false;
2620 }
2621 EXPORT_SYMBOL(skb_page_frag_refill);
2622 
2623 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2624 {
2625 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2626 		return true;
2627 
2628 	sk_enter_memory_pressure(sk);
2629 	sk_stream_moderate_sndbuf(sk);
2630 	return false;
2631 }
2632 EXPORT_SYMBOL(sk_page_frag_refill);
2633 
2634 void __lock_sock(struct sock *sk)
2635 	__releases(&sk->sk_lock.slock)
2636 	__acquires(&sk->sk_lock.slock)
2637 {
2638 	DEFINE_WAIT(wait);
2639 
2640 	for (;;) {
2641 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2642 					TASK_UNINTERRUPTIBLE);
2643 		spin_unlock_bh(&sk->sk_lock.slock);
2644 		schedule();
2645 		spin_lock_bh(&sk->sk_lock.slock);
2646 		if (!sock_owned_by_user(sk))
2647 			break;
2648 	}
2649 	finish_wait(&sk->sk_lock.wq, &wait);
2650 }
2651 
2652 void __release_sock(struct sock *sk)
2653 	__releases(&sk->sk_lock.slock)
2654 	__acquires(&sk->sk_lock.slock)
2655 {
2656 	struct sk_buff *skb, *next;
2657 
2658 	while ((skb = sk->sk_backlog.head) != NULL) {
2659 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2660 
2661 		spin_unlock_bh(&sk->sk_lock.slock);
2662 
2663 		do {
2664 			next = skb->next;
2665 			prefetch(next);
2666 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2667 			skb_mark_not_on_list(skb);
2668 			sk_backlog_rcv(sk, skb);
2669 
2670 			cond_resched();
2671 
2672 			skb = next;
2673 		} while (skb != NULL);
2674 
2675 		spin_lock_bh(&sk->sk_lock.slock);
2676 	}
2677 
2678 	/*
2679 	 * Doing the zeroing here guarantee we can not loop forever
2680 	 * while a wild producer attempts to flood us.
2681 	 */
2682 	sk->sk_backlog.len = 0;
2683 }
2684 
2685 void __sk_flush_backlog(struct sock *sk)
2686 {
2687 	spin_lock_bh(&sk->sk_lock.slock);
2688 	__release_sock(sk);
2689 	spin_unlock_bh(&sk->sk_lock.slock);
2690 }
2691 
2692 /**
2693  * sk_wait_data - wait for data to arrive at sk_receive_queue
2694  * @sk:    sock to wait on
2695  * @timeo: for how long
2696  * @skb:   last skb seen on sk_receive_queue
2697  *
2698  * Now socket state including sk->sk_err is changed only under lock,
2699  * hence we may omit checks after joining wait queue.
2700  * We check receive queue before schedule() only as optimization;
2701  * it is very likely that release_sock() added new data.
2702  */
2703 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2704 {
2705 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2706 	int rc;
2707 
2708 	add_wait_queue(sk_sleep(sk), &wait);
2709 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2710 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2711 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2712 	remove_wait_queue(sk_sleep(sk), &wait);
2713 	return rc;
2714 }
2715 EXPORT_SYMBOL(sk_wait_data);
2716 
2717 /**
2718  *	__sk_mem_raise_allocated - increase memory_allocated
2719  *	@sk: socket
2720  *	@size: memory size to allocate
2721  *	@amt: pages to allocate
2722  *	@kind: allocation type
2723  *
2724  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2725  */
2726 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2727 {
2728 	struct proto *prot = sk->sk_prot;
2729 	long allocated = sk_memory_allocated_add(sk, amt);
2730 	bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2731 	bool charged = true;
2732 
2733 	if (memcg_charge &&
2734 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2735 						gfp_memcg_charge())))
2736 		goto suppress_allocation;
2737 
2738 	/* Under limit. */
2739 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2740 		sk_leave_memory_pressure(sk);
2741 		return 1;
2742 	}
2743 
2744 	/* Under pressure. */
2745 	if (allocated > sk_prot_mem_limits(sk, 1))
2746 		sk_enter_memory_pressure(sk);
2747 
2748 	/* Over hard limit. */
2749 	if (allocated > sk_prot_mem_limits(sk, 2))
2750 		goto suppress_allocation;
2751 
2752 	/* guarantee minimum buffer size under pressure */
2753 	if (kind == SK_MEM_RECV) {
2754 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2755 			return 1;
2756 
2757 	} else { /* SK_MEM_SEND */
2758 		int wmem0 = sk_get_wmem0(sk, prot);
2759 
2760 		if (sk->sk_type == SOCK_STREAM) {
2761 			if (sk->sk_wmem_queued < wmem0)
2762 				return 1;
2763 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2764 				return 1;
2765 		}
2766 	}
2767 
2768 	if (sk_has_memory_pressure(sk)) {
2769 		u64 alloc;
2770 
2771 		if (!sk_under_memory_pressure(sk))
2772 			return 1;
2773 		alloc = sk_sockets_allocated_read_positive(sk);
2774 		if (sk_prot_mem_limits(sk, 2) > alloc *
2775 		    sk_mem_pages(sk->sk_wmem_queued +
2776 				 atomic_read(&sk->sk_rmem_alloc) +
2777 				 sk->sk_forward_alloc))
2778 			return 1;
2779 	}
2780 
2781 suppress_allocation:
2782 
2783 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2784 		sk_stream_moderate_sndbuf(sk);
2785 
2786 		/* Fail only if socket is _under_ its sndbuf.
2787 		 * In this case we cannot block, so that we have to fail.
2788 		 */
2789 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2790 			/* Force charge with __GFP_NOFAIL */
2791 			if (memcg_charge && !charged) {
2792 				mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2793 					gfp_memcg_charge() | __GFP_NOFAIL);
2794 			}
2795 			return 1;
2796 		}
2797 	}
2798 
2799 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2800 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2801 
2802 	sk_memory_allocated_sub(sk, amt);
2803 
2804 	if (memcg_charge && charged)
2805 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2806 
2807 	return 0;
2808 }
2809 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2810 
2811 /**
2812  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2813  *	@sk: socket
2814  *	@size: memory size to allocate
2815  *	@kind: allocation type
2816  *
2817  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2818  *	rmem allocation. This function assumes that protocols which have
2819  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2820  */
2821 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2822 {
2823 	int ret, amt = sk_mem_pages(size);
2824 
2825 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2826 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2827 	if (!ret)
2828 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2829 	return ret;
2830 }
2831 EXPORT_SYMBOL(__sk_mem_schedule);
2832 
2833 /**
2834  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2835  *	@sk: socket
2836  *	@amount: number of quanta
2837  *
2838  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2839  */
2840 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2841 {
2842 	sk_memory_allocated_sub(sk, amount);
2843 
2844 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2845 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2846 
2847 	if (sk_under_memory_pressure(sk) &&
2848 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2849 		sk_leave_memory_pressure(sk);
2850 }
2851 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2852 
2853 /**
2854  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2855  *	@sk: socket
2856  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2857  */
2858 void __sk_mem_reclaim(struct sock *sk, int amount)
2859 {
2860 	amount >>= SK_MEM_QUANTUM_SHIFT;
2861 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2862 	__sk_mem_reduce_allocated(sk, amount);
2863 }
2864 EXPORT_SYMBOL(__sk_mem_reclaim);
2865 
2866 int sk_set_peek_off(struct sock *sk, int val)
2867 {
2868 	sk->sk_peek_off = val;
2869 	return 0;
2870 }
2871 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2872 
2873 /*
2874  * Set of default routines for initialising struct proto_ops when
2875  * the protocol does not support a particular function. In certain
2876  * cases where it makes no sense for a protocol to have a "do nothing"
2877  * function, some default processing is provided.
2878  */
2879 
/* Stub bind handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2884 EXPORT_SYMBOL(sock_no_bind);
2885 
/* Stub connect handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2891 EXPORT_SYMBOL(sock_no_connect);
2892 
/* Stub socketpair handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2897 EXPORT_SYMBOL(sock_no_socketpair);
2898 
/* Stub accept handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
2904 EXPORT_SYMBOL(sock_no_accept);
2905 
/* Stub getname handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
2911 EXPORT_SYMBOL(sock_no_getname);
2912 
/* Stub ioctl handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2917 EXPORT_SYMBOL(sock_no_ioctl);
2918 
/* Stub listen handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2923 EXPORT_SYMBOL(sock_no_listen);
2924 
/* Stub shutdown handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2929 EXPORT_SYMBOL(sock_no_shutdown);
2930 
/* Stub sendmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2935 EXPORT_SYMBOL(sock_no_sendmsg);
2936 
/* Stub sendmsg_locked handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2941 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2942 
/* Stub recvmsg handler: ignores its arguments and returns -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2948 EXPORT_SYMBOL(sock_no_recvmsg);
2949 
/* Stub mmap handler: ignores its arguments and returns -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
2955 EXPORT_SYMBOL(sock_no_mmap);
2956 
2957 /*
2958  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2959  * various sock-based usage counts.
2960  */
2961 void __receive_sock(struct file *file)
2962 {
2963 	struct socket *sock;
2964 
2965 	sock = sock_from_file(file);
2966 	if (sock) {
2967 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2968 		sock_update_classid(&sock->sk->sk_cgrp_data);
2969 	}
2970 }
2971 
2972 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2973 {
2974 	ssize_t res;
2975 	struct msghdr msg = {.msg_flags = flags};
2976 	struct kvec iov;
2977 	char *kaddr = kmap(page);
2978 	iov.iov_base = kaddr + offset;
2979 	iov.iov_len = size;
2980 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2981 	kunmap(page);
2982 	return res;
2983 }
2984 EXPORT_SYMBOL(sock_no_sendpage);
2985 
2986 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2987 				int offset, size_t size, int flags)
2988 {
2989 	ssize_t res;
2990 	struct msghdr msg = {.msg_flags = flags};
2991 	struct kvec iov;
2992 	char *kaddr = kmap(page);
2993 
2994 	iov.iov_base = kaddr + offset;
2995 	iov.iov_len = size;
2996 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2997 	kunmap(page);
2998 	return res;
2999 }
3000 EXPORT_SYMBOL(sock_no_sendpage_locked);
3001 
3002 /*
3003  *	Default Socket Callbacks
3004  */
3005 
3006 static void sock_def_wakeup(struct sock *sk)
3007 {
3008 	struct socket_wq *wq;
3009 
3010 	rcu_read_lock();
3011 	wq = rcu_dereference(sk->sk_wq);
3012 	if (skwq_has_sleeper(wq))
3013 		wake_up_interruptible_all(&wq->wait);
3014 	rcu_read_unlock();
3015 }
3016 
3017 static void sock_def_error_report(struct sock *sk)
3018 {
3019 	struct socket_wq *wq;
3020 
3021 	rcu_read_lock();
3022 	wq = rcu_dereference(sk->sk_wq);
3023 	if (skwq_has_sleeper(wq))
3024 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3025 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3026 	rcu_read_unlock();
3027 }
3028 
3029 void sock_def_readable(struct sock *sk)
3030 {
3031 	struct socket_wq *wq;
3032 
3033 	rcu_read_lock();
3034 	wq = rcu_dereference(sk->sk_wq);
3035 	if (skwq_has_sleeper(wq))
3036 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3037 						EPOLLRDNORM | EPOLLRDBAND);
3038 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3039 	rcu_read_unlock();
3040 }
3041 
3042 static void sock_def_write_space(struct sock *sk)
3043 {
3044 	struct socket_wq *wq;
3045 
3046 	rcu_read_lock();
3047 
3048 	/* Do not wake up a writer until he can make "significant"
3049 	 * progress.  --DaveM
3050 	 */
3051 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3052 		wq = rcu_dereference(sk->sk_wq);
3053 		if (skwq_has_sleeper(wq))
3054 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3055 						EPOLLWRNORM | EPOLLWRBAND);
3056 
3057 		/* Should agree with poll, otherwise some programs break */
3058 		if (sock_writeable(sk))
3059 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3060 	}
3061 
3062 	rcu_read_unlock();
3063 }
3064 
/* No-op destructor; sock_init_data() installs it as sk_destruct. */
static void sock_def_destruct(struct sock *sk)
{
}
3068 
3069 void sk_send_sigurg(struct sock *sk)
3070 {
3071 	if (sk->sk_socket && sk->sk_socket->file)
3072 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3073 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3074 }
3075 EXPORT_SYMBOL(sk_send_sigurg);
3076 
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken
 * when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
3083 EXPORT_SYMBOL(sk_reset_timer);
3084 
/*
 * Cancel @timer with del_timer(); when that returns non-zero the
 * socket reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
3090 EXPORT_SYMBOL(sk_stop_timer);
3091 
/*
 * Cancel @timer with del_timer_sync(); when that returns non-zero the
 * socket reference is dropped with __sock_put().
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer_sync(timer))
		return;

	__sock_put(sk);
}
3097 EXPORT_SYMBOL(sk_stop_timer_sync);
3098 
3099 void sock_init_data(struct socket *sock, struct sock *sk)
3100 {
3101 	sk_init_common(sk);
3102 	sk->sk_send_head	=	NULL;
3103 
3104 	timer_setup(&sk->sk_timer, NULL, 0);
3105 
3106 	sk->sk_allocation	=	GFP_KERNEL;
3107 	sk->sk_rcvbuf		=	sysctl_rmem_default;
3108 	sk->sk_sndbuf		=	sysctl_wmem_default;
3109 	sk->sk_state		=	TCP_CLOSE;
3110 	sk_set_socket(sk, sock);
3111 
3112 	sock_set_flag(sk, SOCK_ZAPPED);
3113 
3114 	if (sock) {
3115 		sk->sk_type	=	sock->type;
3116 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3117 		sock->sk	=	sk;
3118 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
3119 	} else {
3120 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3121 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
3122 	}
3123 
3124 	rwlock_init(&sk->sk_callback_lock);
3125 	if (sk->sk_kern_sock)
3126 		lockdep_set_class_and_name(
3127 			&sk->sk_callback_lock,
3128 			af_kern_callback_keys + sk->sk_family,
3129 			af_family_kern_clock_key_strings[sk->sk_family]);
3130 	else
3131 		lockdep_set_class_and_name(
3132 			&sk->sk_callback_lock,
3133 			af_callback_keys + sk->sk_family,
3134 			af_family_clock_key_strings[sk->sk_family]);
3135 
3136 	sk->sk_state_change	=	sock_def_wakeup;
3137 	sk->sk_data_ready	=	sock_def_readable;
3138 	sk->sk_write_space	=	sock_def_write_space;
3139 	sk->sk_error_report	=	sock_def_error_report;
3140 	sk->sk_destruct		=	sock_def_destruct;
3141 
3142 	sk->sk_frag.page	=	NULL;
3143 	sk->sk_frag.offset	=	0;
3144 	sk->sk_peek_off		=	-1;
3145 
3146 	sk->sk_peer_pid 	=	NULL;
3147 	sk->sk_peer_cred	=	NULL;
3148 	sk->sk_write_pending	=	0;
3149 	sk->sk_rcvlowat		=	1;
3150 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3151 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3152 
3153 	sk->sk_stamp = SK_DEFAULT_STAMP;
3154 #if BITS_PER_LONG==32
3155 	seqlock_init(&sk->sk_stamp_seq);
3156 #endif
3157 	atomic_set(&sk->sk_zckey, 0);
3158 
3159 #ifdef CONFIG_NET_RX_BUSY_POLL
3160 	sk->sk_napi_id		=	0;
3161 	sk->sk_ll_usec		=	sysctl_net_busy_read;
3162 #endif
3163 
3164 	sk->sk_max_pacing_rate = ~0UL;
3165 	sk->sk_pacing_rate = ~0UL;
3166 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3167 	sk->sk_incoming_cpu = -1;
3168 
3169 	sk_rx_queue_clear(sk);
3170 	/*
3171 	 * Before updating sk_refcnt, we must commit prior changes to memory
3172 	 * (Documentation/RCU/rculist_nulls.rst for details)
3173 	 */
3174 	smp_wmb();
3175 	refcount_set(&sk->sk_refcnt, 1);
3176 	atomic_set(&sk->sk_drops, 0);
3177 }
3178 EXPORT_SYMBOL(sock_init_data);
3179 
3180 void lock_sock_nested(struct sock *sk, int subclass)
3181 {
3182 	might_sleep();
3183 	spin_lock_bh(&sk->sk_lock.slock);
3184 	if (sk->sk_lock.owned)
3185 		__lock_sock(sk);
3186 	sk->sk_lock.owned = 1;
3187 	spin_unlock(&sk->sk_lock.slock);
3188 	/*
3189 	 * The sk_lock has mutex_lock() semantics here:
3190 	 */
3191 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3192 	local_bh_enable();
3193 }
3194 EXPORT_SYMBOL(lock_sock_nested);
3195 
3196 void release_sock(struct sock *sk)
3197 {
3198 	spin_lock_bh(&sk->sk_lock.slock);
3199 	if (sk->sk_backlog.tail)
3200 		__release_sock(sk);
3201 
3202 	/* Warning : release_cb() might need to release sk ownership,
3203 	 * ie call sock_release_ownership(sk) before us.
3204 	 */
3205 	if (sk->sk_prot->release_cb)
3206 		sk->sk_prot->release_cb(sk);
3207 
3208 	sock_release_ownership(sk);
3209 	if (waitqueue_active(&sk->sk_lock.wq))
3210 		wake_up(&sk->sk_lock.wq);
3211 	spin_unlock_bh(&sk->sk_lock.slock);
3212 }
3213 EXPORT_SYMBOL(release_sock);
3214 
3215 /**
3216  * lock_sock_fast - fast version of lock_sock
3217  * @sk: socket
3218  *
3219  * This version should be used for very small section, where process wont block
3220  * return false if fast path is taken:
3221  *
3222  *   sk_lock.slock locked, owned = 0, BH disabled
3223  *
3224  * return true if slow path is taken:
3225  *
3226  *   sk_lock.slock unlocked, owned = 1, BH enabled
3227  */
3228 bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3229 {
3230 	might_sleep();
3231 	spin_lock_bh(&sk->sk_lock.slock);
3232 
3233 	if (!sk->sk_lock.owned)
3234 		/*
3235 		 * Note : We must disable BH
3236 		 */
3237 		return false;
3238 
3239 	__lock_sock(sk);
3240 	sk->sk_lock.owned = 1;
3241 	spin_unlock(&sk->sk_lock.slock);
3242 	/*
3243 	 * The sk_lock has mutex_lock() semantics here:
3244 	 */
3245 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3246 	__acquire(&sk->sk_lock.slock);
3247 	local_bh_enable();
3248 	return true;
3249 }
3250 EXPORT_SYMBOL(lock_sock_fast);
3251 
3252 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3253 		   bool timeval, bool time32)
3254 {
3255 	struct sock *sk = sock->sk;
3256 	struct timespec64 ts;
3257 
3258 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3259 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3260 	if (ts.tv_sec == -1)
3261 		return -ENOENT;
3262 	if (ts.tv_sec == 0) {
3263 		ktime_t kt = ktime_get_real();
3264 		sock_write_timestamp(sk, kt);
3265 		ts = ktime_to_timespec64(kt);
3266 	}
3267 
3268 	if (timeval)
3269 		ts.tv_nsec /= 1000;
3270 
3271 #ifdef CONFIG_COMPAT_32BIT_TIME
3272 	if (time32)
3273 		return put_old_timespec32(&ts, userstamp);
3274 #endif
3275 #ifdef CONFIG_SPARC64
3276 	/* beware of padding in sparc64 timeval */
3277 	if (timeval && !in_compat_syscall()) {
3278 		struct __kernel_old_timeval __user tv = {
3279 			.tv_sec = ts.tv_sec,
3280 			.tv_usec = ts.tv_nsec,
3281 		};
3282 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3283 			return -EFAULT;
3284 		return 0;
3285 	}
3286 #endif
3287 	return put_timespec64(&ts, userstamp);
3288 }
3289 EXPORT_SYMBOL(sock_gettstamp);
3290 
3291 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3292 {
3293 	if (!sock_flag(sk, flag)) {
3294 		unsigned long previous_flags = sk->sk_flags;
3295 
3296 		sock_set_flag(sk, flag);
3297 		/*
3298 		 * we just set one of the two flags which require net
3299 		 * time stamping, but time stamping might have been on
3300 		 * already because of the other one
3301 		 */
3302 		if (sock_needs_netstamp(sk) &&
3303 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3304 			net_enable_timestamp();
3305 	}
3306 }
3307 
3308 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3309 		       int level, int type)
3310 {
3311 	struct sock_exterr_skb *serr;
3312 	struct sk_buff *skb;
3313 	int copied, err;
3314 
3315 	err = -EAGAIN;
3316 	skb = sock_dequeue_err_skb(sk);
3317 	if (skb == NULL)
3318 		goto out;
3319 
3320 	copied = skb->len;
3321 	if (copied > len) {
3322 		msg->msg_flags |= MSG_TRUNC;
3323 		copied = len;
3324 	}
3325 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3326 	if (err)
3327 		goto out_free_skb;
3328 
3329 	sock_recv_timestamp(msg, sk, skb);
3330 
3331 	serr = SKB_EXT_ERR(skb);
3332 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3333 
3334 	msg->msg_flags |= MSG_ERRQUEUE;
3335 	err = copied;
3336 
3337 out_free_skb:
3338 	kfree_skb(skb);
3339 out:
3340 	return err;
3341 }
3342 EXPORT_SYMBOL(sock_recv_errqueue);
3343 
3344 /*
3345  *	Get a socket option on an socket.
3346  *
3347  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3348  *	asynchronous errors should be reported by getsockopt. We assume
3349  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3350  */
3351 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3352 			   char __user *optval, int __user *optlen)
3353 {
3354 	struct sock *sk = sock->sk;
3355 
3356 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3357 }
3358 EXPORT_SYMBOL(sock_common_getsockopt);
3359 
3360 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3361 			int flags)
3362 {
3363 	struct sock *sk = sock->sk;
3364 	int addr_len = 0;
3365 	int err;
3366 
3367 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3368 				   flags & ~MSG_DONTWAIT, &addr_len);
3369 	if (err >= 0)
3370 		msg->msg_namelen = addr_len;
3371 	return err;
3372 }
3373 EXPORT_SYMBOL(sock_common_recvmsg);
3374 
3375 /*
3376  *	Set socket options on an inet socket.
3377  */
3378 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3379 			   sockptr_t optval, unsigned int optlen)
3380 {
3381 	struct sock *sk = sock->sk;
3382 
3383 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3384 }
3385 EXPORT_SYMBOL(sock_common_setsockopt);
3386 
/*
 * sk_common_release - common teardown sequence for a socket
 * @sk: socket being released
 *
 * Calls the protocol's destroy hook if it has one, unhashes the socket,
 * orphans it, frees its xfrm policy, runs the refcount-debug release hook
 * and finally drops one reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
	/* destroy is optional; unhash below is called unconditionally */
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network stack still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs the
	 * receiver and did the hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and will be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3423 
3424 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3425 {
3426 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3427 
3428 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3429 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3430 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3431 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3432 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3433 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3434 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3435 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3436 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3437 }
3438 
3439 #ifdef CONFIG_PROC_FS
/*
 * Number of counter slots in struct prot_inuse and of bits in the
 * proto_inuse_idx bitmap.
 */
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Counter array indexed by a protocol's inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Bitmap of inuse_idx values handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3446 
3447 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3448 {
3449 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3450 }
3451 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3452 
3453 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3454 {
3455 	int cpu, idx = prot->inuse_idx;
3456 	int res = 0;
3457 
3458 	for_each_possible_cpu(cpu)
3459 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3460 
3461 	return res >= 0 ? res : 0;
3462 }
3463 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3464 
3465 static void sock_inuse_add(struct net *net, int val)
3466 {
3467 	this_cpu_add(*net->core.sock_inuse, val);
3468 }
3469 
3470 int sock_inuse_get(struct net *net)
3471 {
3472 	int cpu, res = 0;
3473 
3474 	for_each_possible_cpu(cpu)
3475 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3476 
3477 	return res;
3478 }
3479 
3480 EXPORT_SYMBOL_GPL(sock_inuse_get);
3481 
3482 static int __net_init sock_inuse_init_net(struct net *net)
3483 {
3484 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3485 	if (net->core.prot_inuse == NULL)
3486 		return -ENOMEM;
3487 
3488 	net->core.sock_inuse = alloc_percpu(int);
3489 	if (net->core.sock_inuse == NULL)
3490 		goto out;
3491 
3492 	return 0;
3493 
3494 out:
3495 	free_percpu(net->core.prot_inuse);
3496 	return -ENOMEM;
3497 }
3498 
/* Free both per-CPU counter areas allocated by sock_inuse_init_net(). */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}
3504 
/* Per-namespace init/exit hooks for the in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3509 
3510 static __init int net_inuse_init(void)
3511 {
3512 	if (register_pernet_subsys(&net_inuse_ops))
3513 		panic("Cannot initialize net inuse counters");
3514 
3515 	return 0;
3516 }
3517 
3518 core_initcall(net_inuse_init);
3519 
3520 static int assign_proto_idx(struct proto *prot)
3521 {
3522 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3523 
3524 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3525 		pr_err("PROTO_INUSE_NR exhausted\n");
3526 		return -ENOSPC;
3527 	}
3528 
3529 	set_bit(prot->inuse_idx, proto_inuse_idx);
3530 	return 0;
3531 }
3532 
3533 static void release_proto_idx(struct proto *prot)
3534 {
3535 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3536 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3537 }
3538 #else
/*
 * !CONFIG_PROC_FS variants: no in-use counters are kept, so index
 * assignment always succeeds and the other helpers do nothing.
 */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

/* Nothing to release when no index was assigned. */
static inline void release_proto_idx(struct proto *prot)
{
}

/* Socket counting is compiled out. */
static void sock_inuse_add(struct net *net, int val)
{
}
3551 #endif
3552 
3553 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3554 {
3555 	if (!twsk_prot)
3556 		return;
3557 	kfree(twsk_prot->twsk_slab_name);
3558 	twsk_prot->twsk_slab_name = NULL;
3559 	kmem_cache_destroy(twsk_prot->twsk_slab);
3560 	twsk_prot->twsk_slab = NULL;
3561 }
3562 
3563 static int tw_prot_init(const struct proto *prot)
3564 {
3565 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3566 
3567 	if (!twsk_prot)
3568 		return 0;
3569 
3570 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3571 					      prot->name);
3572 	if (!twsk_prot->twsk_slab_name)
3573 		return -ENOMEM;
3574 
3575 	twsk_prot->twsk_slab =
3576 		kmem_cache_create(twsk_prot->twsk_slab_name,
3577 				  twsk_prot->twsk_obj_size, 0,
3578 				  SLAB_ACCOUNT | prot->slab_flags,
3579 				  NULL);
3580 	if (!twsk_prot->twsk_slab) {
3581 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3582 			prot->name);
3583 		return -ENOMEM;
3584 	}
3585 
3586 	return 0;
3587 }
3588 
3589 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3590 {
3591 	if (!rsk_prot)
3592 		return;
3593 	kfree(rsk_prot->slab_name);
3594 	rsk_prot->slab_name = NULL;
3595 	kmem_cache_destroy(rsk_prot->slab);
3596 	rsk_prot->slab = NULL;
3597 }
3598 
3599 static int req_prot_init(const struct proto *prot)
3600 {
3601 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3602 
3603 	if (!rsk_prot)
3604 		return 0;
3605 
3606 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3607 					prot->name);
3608 	if (!rsk_prot->slab_name)
3609 		return -ENOMEM;
3610 
3611 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3612 					   rsk_prot->obj_size, 0,
3613 					   SLAB_ACCOUNT | prot->slab_flags,
3614 					   NULL);
3615 
3616 	if (!rsk_prot->slab) {
3617 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3618 			prot->name);
3619 		return -ENOMEM;
3620 	}
3621 	return 0;
3622 }
3623 
/*
 * proto_register - register a protocol and optionally create its slab caches
 * @prot:       protocol to register
 * @alloc_slab: when non-zero, create the sock slab cache and, if the
 *              protocol has them, the request-sock and timewait caches
 *
 * On success the protocol has been given an in-use index and added to
 * proto_list under proto_list_mutex.
 *
 * Returns 0 on success.  Any slab-cache setup failure returns -ENOBUFS
 * (ret keeps its initial value on those paths, whatever the helper
 * returned); a failure of assign_proto_idx() returns that function's error.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		/* the helpers' own error codes are discarded; ret stays -ENOBUFS */
		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/*
	 * Unwind ladder: each label falls through into the next one.  The
	 * alloc_slab checks are needed because the assign_proto_idx()
	 * failure path jumps here even when no caches were created.
	 */
out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		/* this label also tears down the main sock cache */
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
3673 
/*
 * proto_unregister - undo proto_register()
 * @prot: protocol to remove
 *
 * Releases the in-use index and unlinks the protocol from proto_list under
 * proto_list_mutex, then destroys the sock, request-sock and timewait slab
 * caches outside the mutex.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	/* both helpers ignore a NULL ops pointer */
	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
3688 
3689 int sock_load_diag_module(int family, int protocol)
3690 {
3691 	if (!protocol) {
3692 		if (!sock_is_registered(family))
3693 			return -ENOENT;
3694 
3695 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3696 				      NETLINK_SOCK_DIAG, family);
3697 	}
3698 
3699 #ifdef CONFIG_INET
3700 	if (family == AF_INET &&
3701 	    protocol != IPPROTO_RAW &&
3702 	    protocol < MAX_INET_PROTOS &&
3703 	    !rcu_access_pointer(inet_protos[protocol]))
3704 		return -ENOENT;
3705 #endif
3706 
3707 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3708 			      NETLINK_SOCK_DIAG, family, protocol);
3709 }
3710 EXPORT_SYMBOL(sock_load_diag_module);
3711 
3712 #ifdef CONFIG_PROC_FS
/*
 * seq_file start hook: take proto_list_mutex and position the iterator at
 * *pos in proto_list, with the list head itself counted as an element.
 * The mutex is released again in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
3719 
/* seq_file next hook: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3724 
/* seq_file stop hook: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
3730 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method)
		return 'y';
	return 'n';
}
3735 static long sock_prot_memory_allocated(struct proto *proto)
3736 {
3737 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3738 }
3739 
3740 static const char *sock_prot_memory_pressure(struct proto *proto)
3741 {
3742 	return proto->memory_pressure != NULL ?
3743 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3744 }
3745 
/*
 * Emit one row of the "protocols" proc file for @proto: name, object
 * size, in-use socket count for the seq_file's namespace, memory
 * allocated, memory pressure, max header, whether a slab cache exists,
 * owning module, then one y/n column per method pointer.  The order of the
 * y/n arguments must stay in step with the two-letter column header
 * printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3779 
3780 static int proto_seq_show(struct seq_file *seq, void *v)
3781 {
3782 	if (v == &proto_list)
3783 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3784 			   "protocol",
3785 			   "size",
3786 			   "sockets",
3787 			   "memory",
3788 			   "press",
3789 			   "maxhdr",
3790 			   "slab",
3791 			   "module",
3792 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3793 	else
3794 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3795 	return 0;
3796 }
3797 
/* seq_file iterator over proto_list backing the "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
3804 
3805 static __net_init int proto_init_net(struct net *net)
3806 {
3807 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3808 			sizeof(struct seq_net_private)))
3809 		return -ENOMEM;
3810 
3811 	return 0;
3812 }
3813 
/* Remove the "protocols" proc entry created by proto_init_net(). */
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
3818 
3819 
/* Per-namespace init/exit hooks for the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
3824 
/*
 * Register proto_net_ops at subsys_initcall time; returns the result of
 * register_pernet_subsys().
 */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
3831 
3832 #endif /* PROC_FS */
3833 
3834 #ifdef CONFIG_NET_RX_BUSY_POLL
3835 bool sk_busy_loop_end(void *p, unsigned long start_time)
3836 {
3837 	struct sock *sk = p;
3838 
3839 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3840 	       sk_busy_loop_timeout(sk, start_time);
3841 }
3842 EXPORT_SYMBOL(sk_busy_loop_end);
3843 #endif /* CONFIG_NET_RX_BUSY_POLL */
3844 
3845 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3846 {
3847 	if (!sk->sk_prot->bind_add)
3848 		return -EOPNOTSUPP;
3849 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3850 }
3851 EXPORT_SYMBOL(sock_bind_add);
3852