xref: /linux/net/core/sock.c (revision 5ea2f5ffde39251115ef9a566262fb9e52b91cb7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 #include <linux/ethtool.h>
143 
/* Registry of all registered protocol families (struct proto); walked by
 * the /proc seq_file code below and protected by proto_list_mutex.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);
148 
/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	/* Both the file opener's saved capability and the current task's
	 * capability must be present; order matters for short-circuiting.
	 */
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
166 
/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in all user namespaces (i.e. the initial user namespace).
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
181 
/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability @cap
 * over the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
196 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * The _sock_locks() macro expands to one string per address family,
 * indexed by AF_* value; entries "27" and "28" are historical holes
 * in the AF_* numbering.
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

/* lockdep class names for sk_lock / slock / clock, userspace sockets */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* "k-" prefixed variants for kernel-internal sockets */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
/* receive/write/error queue lock class names */
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};
260 
/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters, exposed under /proc/sys/net/core/. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key counting sockets with SOCK_MEMALLOC set; lets hot paths
 * skip memalloc handling entirely when no such socket exists.
 */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);
287 
/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	/* Allocations on behalf of this socket may dip into reserves. */
	sk->sk_allocation |= __GFP_MEMALLOC;
	/* Enable the memalloc fast path for all sockets (counted key). */
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
303 
/* Reverse of sk_set_memalloc(): drop %SOCK_MEMALLOC and the reserve
 * access, and reclaim memory charged while reserves were in use.
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
320 
/* Process one backlogged skb for a SOCK_MEMALLOC socket.  Runs the
 * protocol's backlog receive handler with reclaim disabled so that the
 * swap-over-network path cannot recurse into memory reclaim.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);
336 
337 void sk_error_report(struct sock *sk)
338 {
339 	sk->sk_error_report(sk);
340 
341 	switch (sk->sk_family) {
342 	case AF_INET:
343 		fallthrough;
344 	case AF_INET6:
345 		trace_inet_sk_error_report(sk);
346 		break;
347 	default:
348 		break;
349 	}
350 }
351 EXPORT_SYMBOL(sk_error_report);
352 
/* Convert a jiffies timeout into the user-visible timeval layout for
 * getsockopt(SO_{RCV,SND}TIMEO_*).  Writes into @optval and returns the
 * number of bytes written.  Three layouts exist: 32-bit compat
 * old_timeval32, native __kernel_old_timeval, and the y2038-safe
 * __kernel_sock_timeval.
 */
static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		/* "wait forever" is reported to userspace as {0, 0} */
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}
382 
/* Parse a setsockopt(SO_{RCV,SND}TIMEO_*) value into a jiffies timeout
 * stored in *@timeo_p.  Accepts the same three user-space layouts as
 * sock_get_timeout().  Returns 0 or a negative errno.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		/* Negative timeout behaves as "don't block"; warn a few
		 * (rate-limited) times so buggy applications show up.
		 */
		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	/* {0, 0} means "wait forever". */
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	/* Too-large values saturate to MAX_SCHEDULE_TIMEOUT; usecs are
	 * rounded up to a whole jiffy.
	 */
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}
434 
435 static bool sock_needs_netstamp(const struct sock *sk)
436 {
437 	switch (sk->sk_family) {
438 	case AF_UNSPEC:
439 	case AF_UNIX:
440 		return false;
441 	default:
442 		return true;
443 	}
444 }
445 
/* Clear the timestamp flag bits in @flags on @sk.  If that leaves the
 * socket with no timestamping flag at all (and the family uses
 * netstamps), drop the global net_enable_timestamp() reference.
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
455 
456 
457 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
458 {
459 	unsigned long flags;
460 	struct sk_buff_head *list = &sk->sk_receive_queue;
461 
462 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
463 		atomic_inc(&sk->sk_drops);
464 		trace_sock_rcvqueue_full(sk, skb);
465 		return -ENOMEM;
466 	}
467 
468 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
469 		atomic_inc(&sk->sk_drops);
470 		return -ENOBUFS;
471 	}
472 
473 	skb->dev = NULL;
474 	skb_set_owner_r(skb, sk);
475 
476 	/* we escape from rcu protected region, make sure we dont leak
477 	 * a norefcounted dst
478 	 */
479 	skb_dst_force(skb);
480 
481 	spin_lock_irqsave(&list->lock, flags);
482 	sock_skb_set_dropcount(sk, skb);
483 	__skb_queue_tail(list, skb);
484 	spin_unlock_irqrestore(&list->lock, flags);
485 
486 	if (!sock_flag(sk, SOCK_DEAD))
487 		sk->sk_data_ready(sk);
488 	return 0;
489 }
490 EXPORT_SYMBOL(__sock_queue_rcv_skb);
491 
/* Run the socket filter on @skb, then queue it on the receive queue.
 * Returns 0 on success or a negative errno from the filter/queueing.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);

	if (!err)
		err = __sock_queue_rcv_skb(sk, skb);

	return err;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
503 
504 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
505 		     const int nested, unsigned int trim_cap, bool refcounted)
506 {
507 	int rc = NET_RX_SUCCESS;
508 
509 	if (sk_filter_trim_cap(sk, skb, trim_cap))
510 		goto discard_and_relse;
511 
512 	skb->dev = NULL;
513 
514 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
515 		atomic_inc(&sk->sk_drops);
516 		goto discard_and_relse;
517 	}
518 	if (nested)
519 		bh_lock_sock_nested(sk);
520 	else
521 		bh_lock_sock(sk);
522 	if (!sock_owned_by_user(sk)) {
523 		/*
524 		 * trylock + unlock semantics:
525 		 */
526 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
527 
528 		rc = sk_backlog_rcv(sk, skb);
529 
530 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
531 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
532 		bh_unlock_sock(sk);
533 		atomic_inc(&sk->sk_drops);
534 		goto discard_and_relse;
535 	}
536 
537 	bh_unlock_sock(sk);
538 out:
539 	if (refcounted)
540 		sock_put(sk);
541 	return rc;
542 discard_and_relse:
543 	kfree_skb(skb);
544 	goto out;
545 }
546 EXPORT_SYMBOL(__sk_receive_skb);
547 
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/* Return the socket's cached dst if it is still valid for @cookie,
 * otherwise clear the cache and return NULL.  Caller must hold the
 * socket lock (this variant manipulates sk_dst_cache non-atomically).
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);
569 
/* Lockless counterpart of __sk_dst_check(): takes a reference on the
 * cached dst, validates it against @cookie, and resets the cache when
 * the route has become obsolete.  Returns a referenced dst or NULL.
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);
585 
/* Bind @sk to network interface @ifindex (0 unbinds).  Caller must hold
 * the socket lock.  Re-binding an already-bound socket requires
 * CAP_NET_RAW.  Returns 0 or a negative errno (-ENOPROTOOPT when
 * CONFIG_NETDEVICES is not built in).
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	/* Rehash so lookups account for the new device binding. */
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	/* Cached route may now go out the wrong interface. */
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}
613 
614 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
615 {
616 	int ret;
617 
618 	if (lock_sk)
619 		lock_sock(sk);
620 	ret = sock_bindtoindex_locked(sk, ifindex);
621 	if (lock_sk)
622 		release_sock(sk);
623 
624 	return ret;
625 }
626 EXPORT_SYMBOL(sock_bindtoindex);
627 
/* Implement setsockopt(SO_BINDTODEVICE): copy the interface name from
 * userspace, resolve it to an ifindex, and bind the socket to it.
 * An empty name (or zero optlen) unbinds the socket.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is needed, so an RCU lookup without
		 * taking a device reference is sufficient.
		 */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}
673 
/* Implement getsockopt(SO_BINDTODEVICE): copy the bound interface name
 * (or a zero length when unbound) back to userspace.  @len is the
 * caller-supplied buffer size.
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (sk->sk_bound_dev_if == 0) {
		/* Not bound: report length 0, copy no name. */
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	/* Include the trailing NUL in the reported length. */
	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
713 
714 bool sk_mc_loop(struct sock *sk)
715 {
716 	if (dev_recursion_level())
717 		return false;
718 	if (!sk)
719 		return true;
720 	switch (sk->sk_family) {
721 	case AF_INET:
722 		return inet_sk(sk)->mc_loop;
723 #if IS_ENABLED(CONFIG_IPV6)
724 	case AF_INET6:
725 		return inet6_sk(sk)->mc_loop;
726 #endif
727 	}
728 	WARN_ON_ONCE(1);
729 	return true;
730 }
731 EXPORT_SYMBOL(sk_mc_loop);
732 
/* Kernel-internal helper equivalent to setsockopt(SO_REUSEADDR, 1). */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);
740 
/* Kernel-internal helper equivalent to setsockopt(SO_REUSEPORT, 1). */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);
748 
/* Kernel-internal helper equivalent to setsockopt(SO_LINGER) with
 * l_onoff = 1 and l_linger = 0: close() aborts the connection.
 */
void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_lingertime = 0;
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);
757 
/* Kernel-internal helper equivalent to setsockopt(SO_PRIORITY); no
 * capability check, callers are trusted kernel code.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
765 
766 void sock_set_sndtimeo(struct sock *sk, s64 secs)
767 {
768 	lock_sock(sk);
769 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
770 		sk->sk_sndtimeo = secs * HZ;
771 	else
772 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
773 	release_sock(sk);
774 }
775 EXPORT_SYMBOL(sock_set_sndtimeo);
776 
/* Common backend for the SO_TIMESTAMP{,NS}_{OLD,NEW} options.
 * @val enables/disables receive timestamps, @new selects the y2038-safe
 * layout (SOCK_TSTAMP_NEW) and @ns selects nanosecond resolution.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val)  {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		/* Takes the global net timestamp reference if needed. */
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
	}
}
789 
/* Kernel-internal helper: enable nanosecond receive timestamps
 * (equivalent to SO_TIMESTAMPNS_OLD = 1).
 */
void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
797 
/* Dispatch one of the four SO_TIMESTAMP* boolean options to
 * __sock_set_timestamps() with the matching (new-layout, ns) pair.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}
815 
/* Bind timestamping on @sk to the PHC virtual clock @phc_index.
 * Requires the socket to be bound to a device (sk_bound_dev_if) and
 * @phc_index to be one of that device's PHC vclocks.  Returns 0 or a
 * negative errno.
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
	struct net *net = sock_net(sk);
	struct net_device *dev = NULL;
	bool match = false;
	int *vclock_index;
	int i, num;

	if (sk->sk_bound_dev_if)
		dev = dev_get_by_index(net, sk->sk_bound_dev_if);

	if (!dev) {
		pr_err("%s: sock not bind to device\n", __func__);
		return -EOPNOTSUPP;
	}

	/* On success, ethtool allocates and fills vclock_index with
	 * `num` entries; a negative return means lookup failure.
	 */
	num = ethtool_get_phc_vclocks(dev, &vclock_index);
	for (i = 0; i < num; i++) {
		if (*(vclock_index + i) == phc_index) {
			match = true;
			break;
		}
	}

	if (num > 0)
		kfree(vclock_index);

	if (!match)
		return -EINVAL;

	sk->sk_bind_phc = phc_index;

	return 0;
}
850 
/* Apply a SO_TIMESTAMPING_{OLD,NEW} request: validate the flag mask,
 * initialize the OPT_ID key for TCP, optionally bind to a PHC vclock,
 * then store the flags and toggle RX software timestamping.
 */
int sock_set_timestamping(struct sock *sk, int optname,
			  struct so_timestamping timestamping)
{
	int val = timestamping.flags;
	int ret;

	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	/* When OPT_ID is newly enabled, seed the per-skb key so IDs
	 * line up with TCP sequence accounting (snd_una) on
	 * established TCP sockets.
	 */
	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			sk->sk_tskey = tcp_sk(sk)->snd_una;
		} else {
			sk->sk_tskey = 0;
		}
	}

	/* OPT_STATS only makes sense together with OPT_TSONLY. */
	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_BIND_PHC) {
		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
		if (ret)
			return ret;
	}

	sk->sk_tsflags = val;
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}
894 
/* Kernel-internal helper equivalent to setsockopt(SO_KEEPALIVE, 1);
 * also invokes the protocol's keepalive hook when it has one.
 */
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
904 
/* Set sk_rcvbuf from a userspace-requested @val and lock the value
 * against automatic tuning.  Caller holds the socket lock.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}
925 
/* Kernel-internal helper equivalent to setsockopt(SO_RCVBUFFORCE):
 * takes the socket lock and applies @val without capability checks.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
933 
/* Set sk_mark and invalidate the cached route, since the mark can
 * affect policy routing decisions.  Caller holds the socket lock.
 */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		sk->sk_mark = val;
		sk_dst_reset(sk);
	}
}
941 
/* Kernel-internal helper equivalent to setsockopt(SO_MARK); no
 * capability check, callers are trusted kernel code.
 */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);
949 
950 /*
951  *	This is meant for all protocols to use and covers goings on
952  *	at the socket level. Everything here is generic.
953  */
954 
955 int sock_setsockopt(struct socket *sock, int level, int optname,
956 		    sockptr_t optval, unsigned int optlen)
957 {
958 	struct so_timestamping timestamping;
959 	struct sock_txtime sk_txtime;
960 	struct sock *sk = sock->sk;
961 	int val;
962 	int valbool;
963 	struct linger ling;
964 	int ret = 0;
965 
966 	/*
967 	 *	Options without arguments
968 	 */
969 
970 	if (optname == SO_BINDTODEVICE)
971 		return sock_setbindtodevice(sk, optval, optlen);
972 
973 	if (optlen < sizeof(int))
974 		return -EINVAL;
975 
976 	if (copy_from_sockptr(&val, optval, sizeof(val)))
977 		return -EFAULT;
978 
979 	valbool = val ? 1 : 0;
980 
981 	lock_sock(sk);
982 
983 	switch (optname) {
984 	case SO_DEBUG:
985 		if (val && !capable(CAP_NET_ADMIN))
986 			ret = -EACCES;
987 		else
988 			sock_valbool_flag(sk, SOCK_DBG, valbool);
989 		break;
990 	case SO_REUSEADDR:
991 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
992 		break;
993 	case SO_REUSEPORT:
994 		sk->sk_reuseport = valbool;
995 		break;
996 	case SO_TYPE:
997 	case SO_PROTOCOL:
998 	case SO_DOMAIN:
999 	case SO_ERROR:
1000 		ret = -ENOPROTOOPT;
1001 		break;
1002 	case SO_DONTROUTE:
1003 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1004 		sk_dst_reset(sk);
1005 		break;
1006 	case SO_BROADCAST:
1007 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1008 		break;
1009 	case SO_SNDBUF:
1010 		/* Don't error on this BSD doesn't and if you think
1011 		 * about it this is right. Otherwise apps have to
1012 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1013 		 * are treated in BSD as hints
1014 		 */
1015 		val = min_t(u32, val, sysctl_wmem_max);
1016 set_sndbuf:
1017 		/* Ensure val * 2 fits into an int, to prevent max_t()
1018 		 * from treating it as a negative value.
1019 		 */
1020 		val = min_t(int, val, INT_MAX / 2);
1021 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1022 		WRITE_ONCE(sk->sk_sndbuf,
1023 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1024 		/* Wake up sending tasks if we upped the value. */
1025 		sk->sk_write_space(sk);
1026 		break;
1027 
1028 	case SO_SNDBUFFORCE:
1029 		if (!capable(CAP_NET_ADMIN)) {
1030 			ret = -EPERM;
1031 			break;
1032 		}
1033 
1034 		/* No negative values (to prevent underflow, as val will be
1035 		 * multiplied by 2).
1036 		 */
1037 		if (val < 0)
1038 			val = 0;
1039 		goto set_sndbuf;
1040 
1041 	case SO_RCVBUF:
1042 		/* Don't error on this BSD doesn't and if you think
1043 		 * about it this is right. Otherwise apps have to
1044 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1045 		 * are treated in BSD as hints
1046 		 */
1047 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1048 		break;
1049 
1050 	case SO_RCVBUFFORCE:
1051 		if (!capable(CAP_NET_ADMIN)) {
1052 			ret = -EPERM;
1053 			break;
1054 		}
1055 
1056 		/* No negative values (to prevent underflow, as val will be
1057 		 * multiplied by 2).
1058 		 */
1059 		__sock_set_rcvbuf(sk, max(val, 0));
1060 		break;
1061 
1062 	case SO_KEEPALIVE:
1063 		if (sk->sk_prot->keepalive)
1064 			sk->sk_prot->keepalive(sk, valbool);
1065 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1066 		break;
1067 
1068 	case SO_OOBINLINE:
1069 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1070 		break;
1071 
1072 	case SO_NO_CHECK:
1073 		sk->sk_no_check_tx = valbool;
1074 		break;
1075 
1076 	case SO_PRIORITY:
1077 		if ((val >= 0 && val <= 6) ||
1078 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1079 			sk->sk_priority = val;
1080 		else
1081 			ret = -EPERM;
1082 		break;
1083 
1084 	case SO_LINGER:
1085 		if (optlen < sizeof(ling)) {
1086 			ret = -EINVAL;	/* 1003.1g */
1087 			break;
1088 		}
1089 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1090 			ret = -EFAULT;
1091 			break;
1092 		}
1093 		if (!ling.l_onoff)
1094 			sock_reset_flag(sk, SOCK_LINGER);
1095 		else {
1096 #if (BITS_PER_LONG == 32)
1097 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1098 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1099 			else
1100 #endif
1101 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1102 			sock_set_flag(sk, SOCK_LINGER);
1103 		}
1104 		break;
1105 
1106 	case SO_BSDCOMPAT:
1107 		break;
1108 
1109 	case SO_PASSCRED:
1110 		if (valbool)
1111 			set_bit(SOCK_PASSCRED, &sock->flags);
1112 		else
1113 			clear_bit(SOCK_PASSCRED, &sock->flags);
1114 		break;
1115 
1116 	case SO_TIMESTAMP_OLD:
1117 	case SO_TIMESTAMP_NEW:
1118 	case SO_TIMESTAMPNS_OLD:
1119 	case SO_TIMESTAMPNS_NEW:
1120 		sock_set_timestamp(sk, optname, valbool);
1121 		break;
1122 
1123 	case SO_TIMESTAMPING_NEW:
1124 	case SO_TIMESTAMPING_OLD:
1125 		if (optlen == sizeof(timestamping)) {
1126 			if (copy_from_sockptr(&timestamping, optval,
1127 					      sizeof(timestamping))) {
1128 				ret = -EFAULT;
1129 				break;
1130 			}
1131 		} else {
1132 			memset(&timestamping, 0, sizeof(timestamping));
1133 			timestamping.flags = val;
1134 		}
1135 		ret = sock_set_timestamping(sk, optname, timestamping);
1136 		break;
1137 
1138 	case SO_RCVLOWAT:
1139 		if (val < 0)
1140 			val = INT_MAX;
1141 		if (sock->ops->set_rcvlowat)
1142 			ret = sock->ops->set_rcvlowat(sk, val);
1143 		else
1144 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1145 		break;
1146 
1147 	case SO_RCVTIMEO_OLD:
1148 	case SO_RCVTIMEO_NEW:
1149 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1150 				       optlen, optname == SO_RCVTIMEO_OLD);
1151 		break;
1152 
1153 	case SO_SNDTIMEO_OLD:
1154 	case SO_SNDTIMEO_NEW:
1155 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1156 				       optlen, optname == SO_SNDTIMEO_OLD);
1157 		break;
1158 
1159 	case SO_ATTACH_FILTER: {
1160 		struct sock_fprog fprog;
1161 
1162 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1163 		if (!ret)
1164 			ret = sk_attach_filter(&fprog, sk);
1165 		break;
1166 	}
1167 	case SO_ATTACH_BPF:
1168 		ret = -EINVAL;
1169 		if (optlen == sizeof(u32)) {
1170 			u32 ufd;
1171 
1172 			ret = -EFAULT;
1173 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1174 				break;
1175 
1176 			ret = sk_attach_bpf(ufd, sk);
1177 		}
1178 		break;
1179 
1180 	case SO_ATTACH_REUSEPORT_CBPF: {
1181 		struct sock_fprog fprog;
1182 
1183 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1184 		if (!ret)
1185 			ret = sk_reuseport_attach_filter(&fprog, sk);
1186 		break;
1187 	}
1188 	case SO_ATTACH_REUSEPORT_EBPF:
1189 		ret = -EINVAL;
1190 		if (optlen == sizeof(u32)) {
1191 			u32 ufd;
1192 
1193 			ret = -EFAULT;
1194 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1195 				break;
1196 
1197 			ret = sk_reuseport_attach_bpf(ufd, sk);
1198 		}
1199 		break;
1200 
1201 	case SO_DETACH_REUSEPORT_BPF:
1202 		ret = reuseport_detach_prog(sk);
1203 		break;
1204 
1205 	case SO_DETACH_FILTER:
1206 		ret = sk_detach_filter(sk);
1207 		break;
1208 
1209 	case SO_LOCK_FILTER:
1210 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1211 			ret = -EPERM;
1212 		else
1213 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1214 		break;
1215 
1216 	case SO_PASSSEC:
1217 		if (valbool)
1218 			set_bit(SOCK_PASSSEC, &sock->flags);
1219 		else
1220 			clear_bit(SOCK_PASSSEC, &sock->flags);
1221 		break;
1222 	case SO_MARK:
1223 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1224 			ret = -EPERM;
1225 			break;
1226 		}
1227 
1228 		__sock_set_mark(sk, val);
1229 		break;
1230 
1231 	case SO_RXQ_OVFL:
1232 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1233 		break;
1234 
1235 	case SO_WIFI_STATUS:
1236 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1237 		break;
1238 
1239 	case SO_PEEK_OFF:
1240 		if (sock->ops->set_peek_off)
1241 			ret = sock->ops->set_peek_off(sk, val);
1242 		else
1243 			ret = -EOPNOTSUPP;
1244 		break;
1245 
1246 	case SO_NOFCS:
1247 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1248 		break;
1249 
1250 	case SO_SELECT_ERR_QUEUE:
1251 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1252 		break;
1253 
1254 #ifdef CONFIG_NET_RX_BUSY_POLL
1255 	case SO_BUSY_POLL:
1256 		/* allow unprivileged users to decrease the value */
1257 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1258 			ret = -EPERM;
1259 		else {
1260 			if (val < 0)
1261 				ret = -EINVAL;
1262 			else
1263 				WRITE_ONCE(sk->sk_ll_usec, val);
1264 		}
1265 		break;
1266 	case SO_PREFER_BUSY_POLL:
1267 		if (valbool && !capable(CAP_NET_ADMIN))
1268 			ret = -EPERM;
1269 		else
1270 			WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1271 		break;
1272 	case SO_BUSY_POLL_BUDGET:
1273 		if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1274 			ret = -EPERM;
1275 		} else {
1276 			if (val < 0 || val > U16_MAX)
1277 				ret = -EINVAL;
1278 			else
1279 				WRITE_ONCE(sk->sk_busy_poll_budget, val);
1280 		}
1281 		break;
1282 #endif
1283 
1284 	case SO_MAX_PACING_RATE:
1285 		{
1286 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1287 
1288 		if (sizeof(ulval) != sizeof(val) &&
1289 		    optlen >= sizeof(ulval) &&
1290 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1291 			ret = -EFAULT;
1292 			break;
1293 		}
1294 		if (ulval != ~0UL)
1295 			cmpxchg(&sk->sk_pacing_status,
1296 				SK_PACING_NONE,
1297 				SK_PACING_NEEDED);
1298 		sk->sk_max_pacing_rate = ulval;
1299 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1300 		break;
1301 		}
1302 	case SO_INCOMING_CPU:
1303 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1304 		break;
1305 
1306 	case SO_CNX_ADVICE:
1307 		if (val == 1)
1308 			dst_negative_advice(sk);
1309 		break;
1310 
1311 	case SO_ZEROCOPY:
1312 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1313 			if (!((sk->sk_type == SOCK_STREAM &&
1314 			       sk->sk_protocol == IPPROTO_TCP) ||
1315 			      (sk->sk_type == SOCK_DGRAM &&
1316 			       sk->sk_protocol == IPPROTO_UDP)))
1317 				ret = -ENOTSUPP;
1318 		} else if (sk->sk_family != PF_RDS) {
1319 			ret = -ENOTSUPP;
1320 		}
1321 		if (!ret) {
1322 			if (val < 0 || val > 1)
1323 				ret = -EINVAL;
1324 			else
1325 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1326 		}
1327 		break;
1328 
1329 	case SO_TXTIME:
1330 		if (optlen != sizeof(struct sock_txtime)) {
1331 			ret = -EINVAL;
1332 			break;
1333 		} else if (copy_from_sockptr(&sk_txtime, optval,
1334 			   sizeof(struct sock_txtime))) {
1335 			ret = -EFAULT;
1336 			break;
1337 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1338 			ret = -EINVAL;
1339 			break;
1340 		}
1341 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1342 		 * scheduler has enough safe guards.
1343 		 */
1344 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1345 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1346 			ret = -EPERM;
1347 			break;
1348 		}
1349 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1350 		sk->sk_clockid = sk_txtime.clockid;
1351 		sk->sk_txtime_deadline_mode =
1352 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1353 		sk->sk_txtime_report_errors =
1354 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1355 		break;
1356 
1357 	case SO_BINDTOIFINDEX:
1358 		ret = sock_bindtoindex_locked(sk, val);
1359 		break;
1360 
1361 	default:
1362 		ret = -ENOPROTOOPT;
1363 		break;
1364 	}
1365 	release_sock(sk);
1366 	return ret;
1367 }
1368 EXPORT_SYMBOL(sock_setsockopt);
1369 
1370 
1371 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1372 			  struct ucred *ucred)
1373 {
1374 	ucred->pid = pid_vnr(pid);
1375 	ucred->uid = ucred->gid = -1;
1376 	if (cred) {
1377 		struct user_namespace *current_ns = current_user_ns();
1378 
1379 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1380 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1381 	}
1382 }
1383 
1384 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1385 {
1386 	struct user_namespace *user_ns = current_user_ns();
1387 	int i;
1388 
1389 	for (i = 0; i < src->ngroups; i++)
1390 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1391 			return -EFAULT;
1392 
1393 	return 0;
1394 }
1395 
/* Handle SOL_SOCKET level getsockopt().  Builds the option value in a
 * local union, copies at most min(*optlen, value length) bytes to
 * @optval and writes the copied length back through @optlen.
 * Returns 0 or a negative errno.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* All fixed-size option values are assembled here and written out
	 * with a single copy_to_user() after the switch.
	 */
	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
		struct sock_txtime txtime;
		struct so_timestamping timestamping;
	} v;

	/* lv is the kernel-side length of the value in v; most options are
	 * plain ints, individual cases override it below.
	 */
	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* sock_error() clears the pending error; fall back to the
		 * soft error when no hard error is queued.
		 */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		/* obsolete option: accepted but reports nothing */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		lv = sizeof(v.timestamping);
		v.timestamping.flags = sk->sk_tsflags;
		v.timestamping.bind_phc = sk->sk_bind_phc;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		/* SO_SNDLOWAT is not settable (see default case); always 1 */
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		/* NOTE(review): sk_peer_pid/sk_peer_cred are read here
		 * without a lock — confirm this cannot race with a
		 * concurrent update of the peer credentials.
		 */
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		/* -ERANGE tells the caller the required buffer size */
		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
	case SO_PREFER_BUSY_POLL:
		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
		break;
#endif

	case SO_MAX_PACING_RATE:
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	case SO_NETNS_COOKIE:
		lv = sizeof(u64);
		if (len != lv)
			return -EINVAL;
		v.val64 = sock_net(sk)->net_cookie;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* never copy more than the kernel value's length */
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	/* report the number of bytes actually written to @optval */
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1739 
1740 /*
1741  * Initialize an sk_lock.
1742  *
1743  * (We also register the sk_lock with the lock validator.)
1744  */
static inline void sock_lock_init(struct sock *sk)
{
	/* Kernel sockets get their own per-family lockdep class/name
	 * tables, separate from user sockets of the same family.
	 */
	if (sk->sk_kern_sock)
		sock_lock_init_class_and_name(
			sk,
			af_family_kern_slock_key_strings[sk->sk_family],
			af_family_kern_slock_keys + sk->sk_family,
			af_family_kern_key_strings[sk->sk_family],
			af_family_kern_keys + sk->sk_family);
	else
		sock_lock_init_class_and_name(
			sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
1762 
1763 /*
1764  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
1766  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1767  */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
	const struct proto *prot = READ_ONCE(osk->sk_prot);
#ifdef CONFIG_SECURITY_NETWORK
	/* preserve nsk's own LSM blob pointer across the copy below */
	void *sptr = nsk->sk_security;
#endif

	/* If we move sk_tx_queue_mapping out of the private section,
	 * we must check if sk_tx_queue_clear() is called after
	 * sock_copy() in sk_clone_lock().
	 */
	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
		     offsetof(struct sock, sk_dontcopy_begin) ||
		     offsetof(struct sock, sk_tx_queue_mapping) >=
		     offsetof(struct sock, sk_dontcopy_end));

	/* copy everything before sk_dontcopy_begin and everything from
	 * sk_dontcopy_end to the end of the proto object, skipping the
	 * window in between (sk_refcnt / sk_node, see comment above)
	 */
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}
1794 
/* Allocate a struct sock of @prot's object size — from the proto's slab
 * cache when one exists, from kmalloc otherwise — then attach LSM state
 * and take a reference on the proto's module.  Returns NULL on failure
 * with all partial work undone.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* __GFP_ZERO is masked off: zeroing for slab objects is
		 * done selectively via sk_prot_clear_nulls() below
		 */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
1830 
1831 static void sk_prot_free(struct proto *prot, struct sock *sk)
1832 {
1833 	struct kmem_cache *slab;
1834 	struct module *owner;
1835 
1836 	owner = prot->owner;
1837 	slab = prot->slab;
1838 
1839 	cgroup_sk_free(&sk->sk_cgrp_data);
1840 	mem_cgroup_sk_free(sk);
1841 	security_sk_free(sk);
1842 	if (slab != NULL)
1843 		kmem_cache_free(slab, sk);
1844 	else
1845 		kfree(sk);
1846 	module_put(owner);
1847 }
1848 
1849 /**
1850  *	sk_alloc - All socket objects are allocated here
1851  *	@net: the applicable net namespace
1852  *	@family: protocol family
1853  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1854  *	@prot: struct proto associated with this new sock instance
1855  *	@kern: is this to be a kernel socket?
1856  */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sk->sk_kern_sock = kern;
		sock_lock_init(sk);
		/* kernel sockets do not hold a reference on the netns */
		sk->sk_net_refcnt = kern ? 0 : 1;
		if (likely(sk->sk_net_refcnt)) {
			get_net(net);
			sock_inuse_add(net, 1);
		}

		sock_net_set(sk, net);
		/* initial reference, dropped by sk_free() (see sock_wfree()) */
		refcount_set(&sk->sk_wmem_alloc, 1);

		mem_cgroup_sk_alloc(sk);
		cgroup_sk_alloc(&sk->sk_cgrp_data);
		sock_update_classid(&sk->sk_cgrp_data);
		sock_update_netprioidx(&sk->sk_cgrp_data);
		sk_tx_queue_clear(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
1891 
1892 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1893  * grace period. This is the case for UDP sockets and TCP listeners.
1894  */
static void __sk_destruct(struct rcu_head *head)
{
	/* Final socket teardown.  Runs either inline from sk_destruct()
	 * or as an RCU callback for SOCK_RCU_FREE / reuseport sockets.
	 */
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	/* sk_wmem_alloc is 0 here, which is exactly the condition the
	 * rcu_dereference_check() relies on
	 */
	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	/* drop the cached page fragment, if any */
	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}
1932 
void sk_destruct(struct sock *sk)
{
	/* Destroy @sk, deferring to an RCU callback when the socket may
	 * still be reachable by RCU lookups (SOCK_RCU_FREE, or membership
	 * in a reuseport group).
	 */
	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);

	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
		reuseport_detach_sock(sk);
		/* force RCU-deferred freeing after reuseport detach */
		use_call_rcu = true;
	}

	if (use_call_rcu)
		call_rcu(&sk->sk_rcu, __sk_destruct);
	else
		__sk_destruct(&sk->sk_rcu);
}
1947 
static void __sk_free(struct sock *sk)
{
	/* Reached once sk_wmem_alloc drops to zero, from sk_free() or
	 * sock_wfree().
	 */
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	/* NOTE(review): when diag destroy listeners exist, the broadcast
	 * path appears to take over the final sk_destruct() call —
	 * confirm in sock_diag_broadcast_destroy().
	 */
	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}
1958 
/* Drop the initial sk_wmem_alloc reference taken in sk_alloc(); the
 * socket is only destroyed once all transmit skbs are gone.
 */
void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);
1970 
/* Initialise the receive/write/error queues and the callback lock, and
 * register a per-address-family lockdep class and name for each.
 */
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}
1991 
1992 /**
1993  *	sk_clone_lock - clone a socket, and lock its clone
1994  *	@sk: the socket to clone
1995  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1996  *
1997  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1998  */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt))
		get_net(sock_net(newsk));
	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);
	/* the clone is returned bh-locked; the caller must unlock */
	bh_lock_sock(newsk);
	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
	refcount_set(&newsk->sk_wmem_alloc, 1);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache	= NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued	= 0;
	newsk->sk_forward_alloc = 0;
	atomic_set(&newsk->sk_drops, 0);
	newsk->sk_send_head	= NULL;
	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}
	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk)) {
		sk_free_unlock_clone(newsk);
		newsk = NULL;
		goto out;
	}

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err	   = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();
	if (likely(newsk->sk_net_refcnt))
		sock_inuse_add(sock_net(newsk), 1);

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	/* NOTE(review): refcount starts at 2 — presumably one reference for
	 * the caller and one retained by the protocol; confirm at call sites.
	 */
	refcount_set(&newsk->sk_refcnt, 2);

	/* Increment the counter in the same struct proto as the master
	 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
	 * is the same as sk->sk_prot->socks, as this field was copied
	 * with memcpy).
	 *
	 * This _changes_ the previous behaviour, where
	 * tcp_create_openreq_child always was incrementing the
	 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
	 * to be taken into account in all callers. -acme
	 */
	sk_refcnt_debug_inc(newsk);
	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);
2120 
void sk_free_unlock_clone(struct sock *sk)
{
	/* The socket is still a raw copy of its parent, so invalidate the
	 * destructor and do a plain sk_free() after unlocking.
	 */
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2130 
/* Install @dst as the socket's route and derive the socket's offload
 * capabilities (scatter-gather, checksum, GSO limits) from the output
 * device's features.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		/* xfrm transform present and not offloadable: disable GSO */
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
2152 
2153 /*
2154  *	Simple resource managers for sockets.
2155  */
2156 
2157 
2158 /*
2159  * Write buffer destructor automatically called from kfree_skb.
2160  */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;	/* the reference kept above */
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);
2183 
2184 /* This variant of sock_wfree() is used by TCP,
2185  * since it sets SOCK_USE_WRITE_QUEUE.
2186  */
void __sock_wfree(struct sk_buff *skb)
{
	/* Unlike sock_wfree(), no sk_write_space() callback here: the
	 * socket manages its own write queue (SOCK_USE_WRITE_QUEUE).
	 */
	struct sock *sk = skb->sk;

	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
2194 
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	/* Attach @skb to @sk as transmit memory: account its truesize in
	 * sk_wmem_alloc and let sock_wfree() undo it when the skb is freed.
	 */
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	/* not a full socket: take a plain reference instead and release
	 * it through sock_edemux
	 */
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2216 
/* Return true if @skb's destructor is one we know how to replace in
 * skb_orphan_partial() without breaking the owning socket's accounting.
 */
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
#ifdef CONFIG_TLS_DEVICE
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb->decrypted)
		return false;
#endif
	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}
2229 
/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP pure ACKs carry no payload charge; nothing to release. */
	if (skb_is_tcp_pure_ack(skb))
		return;

	/* Swap the wmem-charging destructor for a plain sk reference. */
	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	/* Fallback: fully orphan the skb (clears skb->sk). */
	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);
2247 
2248 /*
2249  * Read buffer destructor automatically called from kfree_skb.
2250  */
2251 void sock_rfree(struct sk_buff *skb)
2252 {
2253 	struct sock *sk = skb->sk;
2254 	unsigned int len = skb->truesize;
2255 
2256 	atomic_sub(len, &sk->sk_rmem_alloc);
2257 	sk_mem_uncharge(sk, len);
2258 }
2259 EXPORT_SYMBOL(sock_rfree);
2260 
2261 /*
2262  * Buffer destructor for skbs that are not used directly in read or write
2263  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2264  */
2265 void sock_efree(struct sk_buff *skb)
2266 {
2267 	sock_put(skb->sk);
2268 }
2269 EXPORT_SYMBOL(sock_efree);
2270 
/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.  Only drops the reference when
 * the socket is actually refcounted.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	if (sk_is_refcounted(skb->sk))
		sock_gen_put(skb->sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2282 
2283 kuid_t sock_i_uid(struct sock *sk)
2284 {
2285 	kuid_t uid;
2286 
2287 	read_lock_bh(&sk->sk_callback_lock);
2288 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2289 	read_unlock_bh(&sk->sk_callback_lock);
2290 	return uid;
2291 }
2292 EXPORT_SYMBOL(sock_i_uid);
2293 
2294 unsigned long sock_i_ino(struct sock *sk)
2295 {
2296 	unsigned long ino;
2297 
2298 	read_lock_bh(&sk->sk_callback_lock);
2299 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2300 	read_unlock_bh(&sk->sk_callback_lock);
2301 	return ino;
2302 }
2303 EXPORT_SYMBOL(sock_i_ino);
2304 
2305 /*
2306  * Allocate a skb from the socket's send buffer.
2307  */
2308 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2309 			     gfp_t priority)
2310 {
2311 	if (force ||
2312 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2313 		struct sk_buff *skb = alloc_skb(size, priority);
2314 
2315 		if (skb) {
2316 			skb_set_owner_w(skb, sk);
2317 			return skb;
2318 		}
2319 	}
2320 	return NULL;
2321 }
2322 EXPORT_SYMBOL(sock_wmalloc);
2323 
/* Destructor for skbs allocated with sock_omalloc(): returns the
 * truesize to the socket's option memory accounting.
 */
static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}
2330 
/* Allocate an skb charged to the socket's option memory
 * (sk_omem_alloc), bounded by sysctl optmem_max.  The skb is freed
 * through sock_ofree(), which uncharges it.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    sysctl_optmem_max)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	/* charge the real truesize, which may exceed the estimate above */
	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
2350 
2351 /*
2352  * Allocate a memory block from the socket's option memory buffer.
2353  */
2354 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2355 {
2356 	if ((unsigned int)size <= sysctl_optmem_max &&
2357 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2358 		void *mem;
2359 		/* First do the add, to avoid the race if kmalloc
2360 		 * might sleep.
2361 		 */
2362 		atomic_add(size, &sk->sk_omem_alloc);
2363 		mem = kmalloc(size, priority);
2364 		if (mem)
2365 			return mem;
2366 		atomic_sub(size, &sk->sk_omem_alloc);
2367 	}
2368 	return NULL;
2369 }
2370 EXPORT_SYMBOL(sock_kmalloc);
2371 
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 * @nullify selects kfree_sensitive() so the memory is scrubbed before
 * being returned to the allocator (for keys and other secrets).
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	/* uncharge the option memory taken in sock_kmalloc() */
	atomic_sub(size, &sk->sk_omem_alloc);
}
2387 
/* Free a block obtained with sock_kmalloc() (no scrubbing). */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2393 
/* Free a block obtained with sock_kmalloc(), scrubbing its contents
 * first (use for cryptographic key material and similar secrets).
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2399 
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.

   Sleep until write memory becomes available, the timeout expires,
   a signal arrives, the socket is shut down for sending, or an error
   is pending.  Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		/* re-check conditions after queueing on the wait queue to
		 * avoid missing a wakeup between test and sleep
		 */
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
2426 
2427 
2428 /*
2429  *	Generic send/receive buffer handlers
2430  */
2431 
/* Allocate an skb with @header_len linear bytes plus @data_len bytes
 * in page frags (of order up to @max_page_order), charging it to @sk.
 * Blocks (subject to the socket send timeout) until the send buffer
 * has room, unless @noblock.  On failure returns NULL with *@errcode
 * set (-EPIPE after SEND_SHUTDOWN, -EAGAIN on timeout, restart codes
 * on signals, or the pending socket error).
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
2475 
/* Convenience wrapper: allocate a purely linear skb of @size bytes
 * charged to @sk (see sock_alloc_send_pskb()).
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2482 
/* Parse one SOL_SOCKET-level control message into @sockc.
 * Handles SO_MARK (requires CAP_NET_ADMIN), SO_TIMESTAMPING_OLD and
 * SCM_TXTIME; SCM_RIGHTS/SCM_CREDENTIALS are accepted but processed
 * elsewhere.  Returns 0 or a negative errno.
 */
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		/* only the TX-record bits may be overridden per message */
		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		/* u64 in cmsg data may be unaligned */
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);
2524 
/* Walk all control messages in @msg, dispatching SOL_SOCKET ones to
 * __sock_cmsg_send().  Other levels are skipped here (handled by the
 * protocol).  Returns 0 or the first error encountered.
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);
2543 
2544 static void sk_enter_memory_pressure(struct sock *sk)
2545 {
2546 	if (!sk->sk_prot->enter_memory_pressure)
2547 		return;
2548 
2549 	sk->sk_prot->enter_memory_pressure(sk);
2550 }
2551 
/* Clear the protocol's memory pressure state, either via its own
 * callback or by resetting the shared memory_pressure flag directly.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		/* check before write to avoid dirtying a shared cache line
		 * when the flag is already clear
		 */
		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}
2563 
2564 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2565 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2566 
2567 /**
2568  * skb_page_frag_refill - check that a page_frag contains enough room
2569  * @sz: minimum size of the fragment we want to get
2570  * @pfrag: pointer to page_frag
2571  * @gfp: priority for memory allocation
2572  *
2573  * Note: While this allocator tries to use high order pages, there is
2574  * no guarantee that allocations succeed. Therefore, @sz MUST be
2575  * less or equal than PAGE_SIZE.
2576  */
2577 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2578 {
2579 	if (pfrag->page) {
2580 		if (page_ref_count(pfrag->page) == 1) {
2581 			pfrag->offset = 0;
2582 			return true;
2583 		}
2584 		if (pfrag->offset + sz <= pfrag->size)
2585 			return true;
2586 		put_page(pfrag->page);
2587 	}
2588 
2589 	pfrag->offset = 0;
2590 	if (SKB_FRAG_PAGE_ORDER &&
2591 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2592 		/* Avoid direct reclaim but allow kswapd to wake */
2593 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2594 					  __GFP_COMP | __GFP_NOWARN |
2595 					  __GFP_NORETRY,
2596 					  SKB_FRAG_PAGE_ORDER);
2597 		if (likely(pfrag->page)) {
2598 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2599 			return true;
2600 		}
2601 	}
2602 	pfrag->page = alloc_page(gfp);
2603 	if (likely(pfrag->page)) {
2604 		pfrag->size = PAGE_SIZE;
2605 		return true;
2606 	}
2607 	return false;
2608 }
2609 EXPORT_SYMBOL(skb_page_frag_refill);
2610 
/* Refill the socket's page frag (at least 32 bytes).  On failure,
 * signal memory pressure and shrink the send buffer.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
2621 
/* Sleep until the socket lock is no longer owned by a user context.
 * Called with sk_lock.slock held; temporarily drops it around each
 * schedule() and reacquires it before returning.
 */
void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2639 
/* Process every skb queued on the socket backlog.  Called with
 * sk_lock.slock held; the lock is dropped while each batch is
 * delivered via sk_backlog_rcv() so softirqs can keep queueing.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* detach the whole batch before dropping the lock */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2672 
/* Drain the socket backlog; callers hold the socket lock owner-side. */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2679 
2680 /**
2681  * sk_wait_data - wait for data to arrive at sk_receive_queue
2682  * @sk:    sock to wait on
2683  * @timeo: for how long
2684  * @skb:   last skb seen on sk_receive_queue
2685  *
2686  * Now socket state including sk->sk_err is changed only under lock,
2687  * hence we may omit checks after joining wait queue.
2688  * We check receive queue before schedule() only as optimization;
2689  * it is very likely that release_sock() added new data.
2690  */
2691 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2692 {
2693 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2694 	int rc;
2695 
2696 	add_wait_queue(sk_sleep(sk), &wait);
2697 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2698 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2699 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2700 	remove_wait_queue(sk_sleep(sk), &wait);
2701 	return rc;
2702 }
2703 EXPORT_SYMBOL(sk_wait_data);
2704 
2705 /**
2706  *	__sk_mem_raise_allocated - increase memory_allocated
2707  *	@sk: socket
2708  *	@size: memory size to allocate
2709  *	@amt: pages to allocate
2710  *	@kind: allocation type
2711  *
2712  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2713  */
2714 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2715 {
2716 	struct proto *prot = sk->sk_prot;
2717 	long allocated = sk_memory_allocated_add(sk, amt);
2718 	bool charged = true;
2719 
2720 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2721 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2722 		goto suppress_allocation;
2723 
2724 	/* Under limit. */
2725 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2726 		sk_leave_memory_pressure(sk);
2727 		return 1;
2728 	}
2729 
2730 	/* Under pressure. */
2731 	if (allocated > sk_prot_mem_limits(sk, 1))
2732 		sk_enter_memory_pressure(sk);
2733 
2734 	/* Over hard limit. */
2735 	if (allocated > sk_prot_mem_limits(sk, 2))
2736 		goto suppress_allocation;
2737 
2738 	/* guarantee minimum buffer size under pressure */
2739 	if (kind == SK_MEM_RECV) {
2740 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2741 			return 1;
2742 
2743 	} else { /* SK_MEM_SEND */
2744 		int wmem0 = sk_get_wmem0(sk, prot);
2745 
2746 		if (sk->sk_type == SOCK_STREAM) {
2747 			if (sk->sk_wmem_queued < wmem0)
2748 				return 1;
2749 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2750 				return 1;
2751 		}
2752 	}
2753 
2754 	if (sk_has_memory_pressure(sk)) {
2755 		u64 alloc;
2756 
2757 		if (!sk_under_memory_pressure(sk))
2758 			return 1;
2759 		alloc = sk_sockets_allocated_read_positive(sk);
2760 		if (sk_prot_mem_limits(sk, 2) > alloc *
2761 		    sk_mem_pages(sk->sk_wmem_queued +
2762 				 atomic_read(&sk->sk_rmem_alloc) +
2763 				 sk->sk_forward_alloc))
2764 			return 1;
2765 	}
2766 
2767 suppress_allocation:
2768 
2769 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2770 		sk_stream_moderate_sndbuf(sk);
2771 
2772 		/* Fail only if socket is _under_ its sndbuf.
2773 		 * In this case we cannot block, so that we have to fail.
2774 		 */
2775 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2776 			return 1;
2777 	}
2778 
2779 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2780 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2781 
2782 	sk_memory_allocated_sub(sk, amt);
2783 
2784 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2785 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2786 
2787 	return 0;
2788 }
2789 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2790 
2791 /**
2792  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2793  *	@sk: socket
2794  *	@size: memory size to allocate
2795  *	@kind: allocation type
2796  *
2797  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2798  *	rmem allocation. This function assumes that protocols which have
2799  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2800  */
2801 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2802 {
2803 	int ret, amt = sk_mem_pages(size);
2804 
2805 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2806 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2807 	if (!ret)
2808 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2809 	return ret;
2810 }
2811 EXPORT_SYMBOL(__sk_mem_schedule);
2812 
2813 /**
2814  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2815  *	@sk: socket
2816  *	@amount: number of quanta
2817  *
2818  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2819  */
2820 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2821 {
2822 	sk_memory_allocated_sub(sk, amount);
2823 
2824 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2825 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2826 
2827 	if (sk_under_memory_pressure(sk) &&
2828 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2829 		sk_leave_memory_pressure(sk);
2830 }
2831 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2832 
2833 /**
2834  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2835  *	@sk: socket
2836  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2837  */
2838 void __sk_mem_reclaim(struct sock *sk, int amount)
2839 {
2840 	amount >>= SK_MEM_QUANTUM_SHIFT;
2841 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2842 	__sk_mem_reduce_allocated(sk, amount);
2843 }
2844 EXPORT_SYMBOL(__sk_mem_reclaim);
2845 
2846 int sk_set_peek_off(struct sock *sk, int val)
2847 {
2848 	sk->sk_peek_off = val;
2849 	return 0;
2850 }
2851 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2852 
2853 /*
2854  * Set of default routines for initialising struct proto_ops when
2855  * the protocol does not support a particular function. In certain
2856  * cases where it makes no sense for a protocol to have a "do nothing"
2857  * function, some default processing is provided.
2858  */
2859 
/* Default stub: protocol does not support bind(). */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
2865 
/* Default stub: protocol does not support connect(). */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
2872 
/* Default stub: protocol does not support socketpair(). */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
2878 
/* Default stub: protocol does not support accept(). */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
2885 
/* Default stub: protocol does not support getsockname()/getpeername(). */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
2892 
/* Default stub: protocol has no protocol-private ioctls. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
2898 
/* Default stub: protocol does not support listen(). */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2904 
/* Default stub: protocol does not support shutdown(). */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2910 
/* Default stub: protocol does not support sendmsg(). */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2916 
/* Default stub: protocol does not support locked sendmsg(). */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2922 
/* Default stub: protocol does not support recvmsg(). */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2929 
/* Default stub: protocol does not support mmap(). */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2936 
2937 /*
2938  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2939  * various sock-based usage counts.
2940  */
2941 void __receive_sock(struct file *file)
2942 {
2943 	struct socket *sock;
2944 
2945 	sock = sock_from_file(file);
2946 	if (sock) {
2947 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2948 		sock_update_classid(&sock->sk->sk_cgrp_data);
2949 	}
2950 }
2951 
/* Fallback sendpage: emulate by kmapping the page and doing a plain
 * kernel_sendmsg() of the requested range.
 */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);	/* unmapped below, after the send */
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
2965 
/* Like sock_no_sendpage(), but for callers already holding the socket
 * lock: uses kernel_sendmsg_locked().
 */
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);	/* unmapped below, after the send */

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);
2981 
2982 /*
2983  *	Default Socket Callbacks
2984  */
2985 
/* Default sk_state_change callback: wake all sleepers on the socket
 * wait queue.  sk_wq is accessed under RCU.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2996 
/* Default sk_error_report callback: wake EPOLLERR pollers and send
 * SIGIO/POLL_ERR to async-notification owners.
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
3008 
/* Default sk_data_ready callback: wake readers polling for input and
 * send SIGIO/POLL_IN to async-notification owners.
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
3021 
/* Default sk_write_space callback: wake writers once at least half of
 * the send buffer is free again.
 */
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
3044 
/* Default sk_destruct callback: intentionally a no-op. */
static void sock_def_destruct(struct sock *sk)
{
}
3048 
/* Deliver SIGURG to the socket's owner (if any) for urgent data, and
 * wake POLL_PRI waiters when the signal was actually sent.
 */
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
3056 
/* (Re)arm a socket timer.  mod_timer() returns 0 when the timer was
 * not already pending, in which case we take a socket reference that
 * the timer handler (or sk_stop_timer()) is expected to release.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
3064 
/* Cancel a socket timer; if it was pending, drop the reference taken
 * by sk_reset_timer().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
3071 
/* Like sk_stop_timer(), but waits for a running handler to finish
 * before returning (del_timer_sync()).
 */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
	if (del_timer_sync(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer_sync);
3078 
/* Initialize a freshly allocated struct sock with generic defaults
 * and attach it to @sock (which may be NULL for kernel-internal
 * sockets).  Must run before the socket becomes visible to others:
 * sk_refcnt is set last, after a write barrier.
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		/* kernel socket: no inode, attribute root in the netns */
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* separate lockdep classes for kernel vs user sockets, per family */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;	/* SO_PEEK_OFF disabled */

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	/* 32-bit: ktime_t reads/writes are not atomic, guard with seqlock */
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
3159 
/* Acquire the socket lock (process context, may sleep).  @subclass is
 * the lockdep nesting level.  On return, BHs are enabled and the
 * caller owns the lock in mutex-like fashion.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);	/* wait for the current owner */
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
3175 
/* Release the socket lock taken by lock_sock(): first drain any
 * backlog queued while we owned it, run the protocol's release_cb,
 * then hand ownership back and wake the next waiter.
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
3194 
3195 /**
3196  * lock_sock_fast - fast version of lock_sock
3197  * @sk: socket
3198  *
3199  * This version should be used for very small section, where process wont block
3200  * return false if fast path is taken:
3201  *
3202  *   sk_lock.slock locked, owned = 0, BH disabled
3203  *
3204  * return true if slow path is taken:
3205  *
3206  *   sk_lock.slock unlocked, owned = 1, BH enabled
3207  */
3208 bool lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3209 {
3210 	might_sleep();
3211 	spin_lock_bh(&sk->sk_lock.slock);
3212 
3213 	if (!sk->sk_lock.owned)
3214 		/*
3215 		 * Note : We must disable BH
3216 		 */
3217 		return false;
3218 
3219 	__lock_sock(sk);
3220 	sk->sk_lock.owned = 1;
3221 	spin_unlock(&sk->sk_lock.slock);
3222 	/*
3223 	 * The sk_lock has mutex_lock() semantics here:
3224 	 */
3225 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3226 	__acquire(&sk->sk_lock.slock);
3227 	local_bh_enable();
3228 	return true;
3229 }
3230 EXPORT_SYMBOL(lock_sock_fast);
3231 
/* Implement SIOCGSTAMP/SIOCGSTAMPNS: copy the last packet timestamp
 * to userspace in the format selected by @timeval/@time32.  Enables
 * timestamping as a side effect so subsequent queries succeed.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		/* no packet seen yet: report (and record) the current time */
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;	/* nanoseconds -> microseconds */

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
3270 
/* Set a timestamping flag on the socket, enabling the global net
 * timestamp machinery on the first such flag.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
3287 
/* Dequeue one skb from the socket error queue and copy it to @msg,
 * attaching the extended error info as a (@level, @type) cmsg.
 * Returns the number of bytes copied, or -EAGAIN when the queue is
 * empty.  Truncated payloads set MSG_TRUNC.
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
3323 
3324 /*
3325  *	Get a socket option on an socket.
3326  *
3327  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3328  *	asynchronous errors should be reported by getsockopt. We assume
3329  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3330  */
3331 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3332 			   char __user *optval, int __user *optlen)
3333 {
3334 	struct sock *sk = sock->sk;
3335 
3336 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3337 }
3338 EXPORT_SYMBOL(sock_common_getsockopt);
3339 
3340 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3341 			int flags)
3342 {
3343 	struct sock *sk = sock->sk;
3344 	int addr_len = 0;
3345 	int err;
3346 
3347 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3348 				   flags & ~MSG_DONTWAIT, &addr_len);
3349 	if (err >= 0)
3350 		msg->msg_namelen = addr_len;
3351 	return err;
3352 }
3353 EXPORT_SYMBOL(sock_common_recvmsg);
3354 
3355 /*
3356  *	Set socket options on an inet socket.
3357  */
3358 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3359 			   sockptr_t optval, unsigned int optlen)
3360 {
3361 	struct sock *sk = sock->sk;
3362 
3363 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3364 }
3365 EXPORT_SYMBOL(sock_common_setsockopt);
3366 
/* Common last-stage teardown shared by protocols: run the protocol's
 * destroy hook, unhash, orphan, free xfrm policies and drop the
 * caller's reference.  The ordering of the steps below matters; see
 * the inline commentary.
 */
void sk_common_release(struct sock *sk)
{
	/* Protocol-specific destruction, if the protocol provides one. */
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	/* Drop the reference; actual destruction may be deferred until
	 * in-flight packets release theirs. */
	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3403 
/* Fill @mem (an array of at least SK_MEMINFO_VARS u32 slots) with a
 * snapshot of @sk's memory accounting.  The reads are lockless —
 * READ_ONCE()/atomic_read() per field — so individual values are
 * coherent but the set as a whole is not a consistent snapshot.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3418 
3419 #ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-cpu, per-netns socket counts, one slot per registered protocol
 * (indexed by proto->inuse_idx). */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Which inuse_idx slots are allocated; updated under proto_list_mutex
 * (see assign_proto_idx()/release_proto_idx() call sites). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3426 
/* Adjust this cpu's in-use count for protocol @prot in @net by @val
 * (positive on socket creation, negative on destruction). */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3432 
3433 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3434 {
3435 	int cpu, idx = prot->inuse_idx;
3436 	int res = 0;
3437 
3438 	for_each_possible_cpu(cpu)
3439 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3440 
3441 	return res >= 0 ? res : 0;
3442 }
3443 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3444 
/* Adjust this cpu's count of all sockets (any protocol) in @net by @val. */
static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}
3449 
3450 int sock_inuse_get(struct net *net)
3451 {
3452 	int cpu, res = 0;
3453 
3454 	for_each_possible_cpu(cpu)
3455 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3456 
3457 	return res;
3458 }
3459 
3460 EXPORT_SYMBOL_GPL(sock_inuse_get);
3461 
3462 static int __net_init sock_inuse_init_net(struct net *net)
3463 {
3464 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3465 	if (net->core.prot_inuse == NULL)
3466 		return -ENOMEM;
3467 
3468 	net->core.sock_inuse = alloc_percpu(int);
3469 	if (net->core.sock_inuse == NULL)
3470 		goto out;
3471 
3472 	return 0;
3473 
3474 out:
3475 	free_percpu(net->core.prot_inuse);
3476 	return -ENOMEM;
3477 }
3478 
/* Release the per-cpu counters when the netns goes away. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}
3484 
/* Per-netns lifecycle hooks for the in-use counters above. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3489 
/* Register the pernet counters at core_initcall time; this accounting
 * underpins socket bookkeeping, so failure is treated as fatal. */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
3499 
/* Reserve a free slot in proto_inuse_idx for @prot; called with
 * proto_list_mutex held (see proto_register()).  The last slot
 * (PROTO_INUSE_NR - 1) is deliberately never handed out: finding it
 * free means the table is effectively full, and leaving it clear
 * guarantees find_first_zero_bit() never yields an out-of-range index.
 */
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}
3512 
/* Return @prot's slot to the bitmap; called with proto_list_mutex held
 * (see proto_unregister()).  The sentinel index PROTO_INUSE_NR - 1 is
 * never allocated, so it is never cleared either.
 */
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
3518 #else
/* !CONFIG_PROC_FS: no /proc statistics, so slot assignment and socket
 * accounting compile down to no-ops. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
3531 #endif
3532 
/* Undo tw_prot_init(): free the timewait slab name and cache, if the
 * protocol has timewait support.  Safe to call on a partially
 * initialized twsk_prot (fields are NULLed as they are released).
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}
3542 
/* Create the timewait sock slab cache ("tw_sock_<proto>") for @prot,
 * if the protocol declares timewait support.  Returns 0 or -ENOMEM.
 * On the cache-creation failure path the already-allocated name is
 * NOT freed here; the caller (proto_register()) runs tw_prot_cleanup()
 * on its error path, which releases both.
 */
static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}
3568 
/* Undo req_prot_init(): free the request_sock slab name and cache, if
 * the protocol has one.  Safe on partially initialized rsk_prot.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}
3578 
/* Create the request sock slab cache ("request_sock_<proto>") for
 * @prot, if the protocol declares one.  Returns 0 or -ENOMEM.  As with
 * tw_prot_init(), failure after the name allocation leaves cleanup to
 * the caller's error path (req_prot_cleanup() via proto_register()).
 */
static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}
3603 
/* Register @prot in the global protocol list, optionally creating its
 * slab caches (base sock, request_sock, timewait) when @alloc_slab is
 * set.  Returns 0 on success or a negative errno with everything this
 * function allocated torn back down.  Note the error-label ordering:
 * out_free_timewait_sock_slab falls through into the request/base
 * cleanup, so a tw_prot_init() or assign_proto_idx() failure releases
 * all three caches.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		/* The usercopy window (useroffset/usersize) whitelists the
		 * part of the sock that may be copied to/from userspace. */
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	/* proto_list_mutex also protects the inuse-slot bitmap. */
	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
3653 
/* Remove @prot from the global protocol list, release its inuse slot
 * and destroy whatever slab caches proto_register() created for it.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
3668 
/* Request the sock_diag module for @family (and, when non-zero,
 * @protocol) via netlink module aliases.  Existence checks come first
 * so that a bogus query cannot trigger a pointless module load.
 * Returns -ENOENT for unknown families/protocols, else the
 * request_module() result.
 */
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	/* For AF_INET, reject protocols with no registered inet handler.
	 * IPPROTO_RAW is exempted — presumably because raw sockets are
	 * handled outside inet_protos; confirm against inet code. */
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
3691 
3692 #ifdef CONFIG_PROC_FS
/* seq_file iteration over proto_list.  The mutex is held from start to
 * stop so the list cannot change while /proc/net/protocols is walked. */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
3699 
/* Advance to the next registered protocol (mutex held by _start). */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3704 
/* End of iteration: release the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
3710 
/* 'y' if the protocol implements this hook, 'n' otherwise. */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
3715 static long sock_prot_memory_allocated(struct proto *proto)
3716 {
3717 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3718 }
3719 
3720 static const char *sock_prot_memory_pressure(struct proto *proto)
3721 {
3722 	return proto->memory_pressure != NULL ?
3723 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3724 }
3725 
/* Emit one /proc/net/protocols row for @proto.  Column order must stay
 * in sync with the header printed by proto_seq_show(). */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3759 
/* Show one iteration step: the list head sentinel produces the column
 * header, every real entry produces one protocol row. */
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
3777 
/* seq_file operations backing /proc/net/protocols. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
3784 
3785 static __net_init int proto_init_net(struct net *net)
3786 {
3787 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3788 			sizeof(struct seq_net_private)))
3789 		return -ENOMEM;
3790 
3791 	return 0;
3792 }
3793 
/* Remove the per-netns /proc/net/protocols entry. */
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
3798 
3799 
/* Per-netns lifecycle of the /proc/net/protocols file. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
3804 
/* Register the /proc/net/protocols pernet subsystem at boot. */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
3811 
3812 #endif /* PROC_FS */
3813 
3814 #ifdef CONFIG_NET_RX_BUSY_POLL
3815 bool sk_busy_loop_end(void *p, unsigned long start_time)
3816 {
3817 	struct sock *sk = p;
3818 
3819 	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3820 	       sk_busy_loop_timeout(sk, start_time);
3821 }
3822 EXPORT_SYMBOL(sk_busy_loop_end);
3823 #endif /* CONFIG_NET_RX_BUSY_POLL */
3824 
3825 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3826 {
3827 	if (!sk->sk_prot->bind_add)
3828 		return -EOPNOTSUPP;
3829 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3830 }
3831 EXPORT_SYMBOL(sock_bind_add);
3832