xref: /linux/net/core/sock.c (revision e3b9626f09d429788d929c9b9000a069fcfc056e)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/init.h>
111 #include <linux/highmem.h>
112 #include <linux/user_namespace.h>
113 #include <linux/static_key.h>
114 #include <linux/memcontrol.h>
115 #include <linux/prefetch.h>
116 #include <linux/compat.h>
117 
118 #include <linux/uaccess.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 #include <linux/sock_diag.h>
132 
133 #include <linux/filter.h>
134 #include <net/sock_reuseport.h>
135 #include <net/bpf_sk_storage.h>
136 
137 #include <trace/events/sock.h>
138 
139 #include <net/tcp.h>
140 #include <net/busy_poll.h>
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 static void sock_inuse_add(struct net *net, int val);
146 
147 /**
148  * sk_ns_capable - General socket capability test
149  * @sk: Socket to use a capability on or through
150  * @user_ns: The user namespace of the capability to use
151  * @cap: The capability to use
152  *
 * Test to see if the opener of the socket had the capability @cap
 * when the socket was created and if the current process has the
 * capability @cap in the user namespace @user_ns.
156  */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	/* Both the opener's credentials (captured in the file at socket
	 * creation time) and the current task must hold @cap in @user_ns.
	 */
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
164 
165 /**
166  * sk_capable - Socket global capability test
167  * @sk: Socket to use a capability on or through
168  * @cap: The global capability to use
169  *
 * Test to see if the opener of the socket had the capability @cap
 * when the socket was created and if the current process has the
 * capability @cap in all user namespaces.
173  */
bool sk_capable(const struct sock *sk, int cap)
{
	/* Checking against the initial user namespace is equivalent to
	 * requiring @cap in all user namespaces.
	 */
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);
179 
180 /**
181  * sk_net_capable - Network namespace socket capability test
182  * @sk: Socket to use a capability on or through
183  * @cap: The capability to use
184  *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap over the network namespace the socket is a member of.
188  */
bool sk_net_capable(const struct sock *sk, int cap)
{
	/* Check @cap in the user namespace that owns the socket's netns. */
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);
194 
195 /*
196  * Each address family might have different locking rules, so we have
197  * one slock key per address family and separate keys for internal and
198  * userspace sockets.
199  */
200 static struct lock_class_key af_family_keys[AF_MAX];
201 static struct lock_class_key af_family_kern_keys[AF_MAX];
202 static struct lock_class_key af_family_slock_keys[AF_MAX];
203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
204 
205 /*
206  * Make lock validator output more readable. (we pre-construct these
207  * strings build-time, so that runtime initialization of socket
208  * locks is fast):
209  */
210 
211 #define _sock_locks(x)						  \
212   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
213   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
214   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
215   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
216   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
217   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
218   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
219   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
220   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
221   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
222   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
223   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
224   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
225   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
226   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
227   x "AF_MAX"
228 
229 static const char *const af_family_key_strings[AF_MAX+1] = {
230 	_sock_locks("sk_lock-")
231 };
232 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
233 	_sock_locks("slock-")
234 };
235 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
236 	_sock_locks("clock-")
237 };
238 
239 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
240 	_sock_locks("k-sk_lock-")
241 };
242 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
243 	_sock_locks("k-slock-")
244 };
245 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
246 	_sock_locks("k-clock-")
247 };
248 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
249 	_sock_locks("rlock-")
250 };
251 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
252 	_sock_locks("wlock-")
253 };
254 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
255 	_sock_locks("elock-")
256 };
257 
258 /*
259  * sk_callback_lock and sk queues locking rules are per-address-family,
260  * so split the lock classes by using a per-AF key:
261  */
262 static struct lock_class_key af_callback_keys[AF_MAX];
263 static struct lock_class_key af_rlock_keys[AF_MAX];
264 static struct lock_class_key af_wlock_keys[AF_MAX];
265 static struct lock_class_key af_elock_keys[AF_MAX];
266 static struct lock_class_key af_kern_callback_keys[AF_MAX];
267 
268 /* Run time adjustable parameters. */
269 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
270 EXPORT_SYMBOL(sysctl_wmem_max);
271 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
272 EXPORT_SYMBOL(sysctl_rmem_max);
273 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
274 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
275 
276 /* Maximal space eaten by iovec or ancillary data plus some space */
277 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
278 EXPORT_SYMBOL(sysctl_optmem_max);
279 
280 int sysctl_tstamp_allow_data __read_mostly = 1;
281 
282 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
283 EXPORT_SYMBOL_GPL(memalloc_socks_key);
284 
285 /**
286  * sk_set_memalloc - sets %SOCK_MEMALLOC
287  * @sk: socket to set it on
288  *
289  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
290  * It's the responsibility of the admin to adjust min_free_kbytes
291  * to meet the requirements
292  */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	/* All allocations on behalf of this socket may now dip into the
	 * memory reserves; bump the static key gating the memalloc
	 * fast paths.
	 */
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);
300 
/* Counterpart of sk_set_memalloc(): drop %SOCK_MEMALLOC and the
 * associated __GFP_MEMALLOC allocation bit and static-key reference.
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
317 
318 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
319 {
320 	int ret;
321 	unsigned int noreclaim_flag;
322 
323 	/* these should have been dropped before queueing */
324 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
325 
326 	noreclaim_flag = memalloc_noreclaim_save();
327 	ret = sk->sk_backlog_rcv(sk, skb);
328 	memalloc_noreclaim_restore(noreclaim_flag);
329 
330 	return ret;
331 }
332 EXPORT_SYMBOL(__sk_backlog_rcv);
333 
334 static int sock_get_timeout(long timeo, void *optval, bool old_timeval)
335 {
336 	struct __kernel_sock_timeval tv;
337 
338 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
339 		tv.tv_sec = 0;
340 		tv.tv_usec = 0;
341 	} else {
342 		tv.tv_sec = timeo / HZ;
343 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
344 	}
345 
346 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
347 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
348 		*(struct old_timeval32 *)optval = tv32;
349 		return sizeof(tv32);
350 	}
351 
352 	if (old_timeval) {
353 		struct __kernel_old_timeval old_tv;
354 		old_tv.tv_sec = tv.tv_sec;
355 		old_tv.tv_usec = tv.tv_usec;
356 		*(struct __kernel_old_timeval *)optval = old_tv;
357 		return sizeof(old_tv);
358 	}
359 
360 	*(struct __kernel_sock_timeval *)optval = tv;
361 	return sizeof(tv);
362 }
363 
/* Parse a SO_{RCV,SND}TIMEO* value from userspace into *@timeo_p
 * (jiffies).  Three wire formats are accepted depending on
 * @old_timeval and the syscall ABI.  Returns 0 or a negative errno.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		/* 32-bit compat task, old 32-bit timeval layout. */
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv.tv_sec = tv32.tv_sec;
		tv.tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		/* Native task, pre-y2038 timeval layout. */
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv.tv_sec = old_tv.tv_sec;
		tv.tv_usec = old_tv.tv_usec;
	} else {
		/* New y2038-safe sock_timeval layout. */
		if (optlen < sizeof(tv))
			return -EINVAL;
		if (copy_from_sockptr(&tv, optval, sizeof(tv)))
			return -EFAULT;
	}
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		/* Historical behavior: a negative timeout is silently
		 * treated as zero, with a rate-limited warning for the
		 * first few offenders.
		 */
		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	/* {0, 0} means "no timeout". */
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}
415 
416 static bool sock_needs_netstamp(const struct sock *sk)
417 {
418 	switch (sk->sk_family) {
419 	case AF_UNSPEC:
420 	case AF_UNIX:
421 		return false;
422 	default:
423 		return true;
424 	}
425 }
426 
/* Clear the timestamping bits in @flags on @sk and, when no
 * timestamping flags remain and the family uses net timestamps, drop
 * the global netstamp reference taken when they were enabled.
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}
436 
437 
/* Queue @skb on @sk's receive queue, charging it against the socket's
 * receive buffer and memory accounting.  Returns 0 on success,
 * -ENOMEM when sk_rcvbuf is exceeded, -ENOBUFS when memory accounting
 * refuses the charge; the caller owns @skb on failure.
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	/* Hard receive-buffer limit: count the drop and bail out. */
	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	/* Charge skb->truesize against the socket's memory accounting. */
	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	/* Wake readers unless the socket is already dead. */
	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);
472 
/* Run the socket filter on @skb and queue it on @sk's receive queue
 * only when the filter accepts the packet.  Returns 0 on success or a
 * negative errno from either the filter or the queueing step.
 */
int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err = sk_filter(sk, skb);

	return err ? err : __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
484 
/* Deliver @skb to @sk from softirq context: filter (trimming to
 * @trim_cap bytes), then either process directly when the socket is
 * not owned by a user task, or append to the backlog.  Drops a
 * reference on @sk when @refcounted is true.
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	/* Receive queue plus backlog already over limit: drop early. */
	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
		/* Owned by user and the backlog is full: drop. */
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);
528 
/* Lockless variant used by callers that already hold the needed
 * guarantees for __sk_dst_get().  Returns the cached dst if it is
 * still valid for @cookie, otherwise clears the cache (and dependent
 * state) and returns NULL.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		/* Stale route: drop the cached entry and the tx-queue
		 * mapping / pending confirmation derived from it.
		 */
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);
544 
545 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
546 {
547 	struct dst_entry *dst = sk_dst_get(sk);
548 
549 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
550 		sk_dst_reset(sk);
551 		dst_release(dst);
552 		return NULL;
553 	}
554 
555 	return dst;
556 }
557 EXPORT_SYMBOL(sk_dst_check);
558 
/* Bind @sk to interface index @ifindex; the caller holds the socket
 * lock.  An index of 0 unbinds the socket.  Returns 0 or a negative
 * errno (-ENOPROTOOPT when netdevice support is compiled out).
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... rebinding an already-bound socket needs CAP_NET_RAW */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	sk->sk_bound_dev_if = ifindex;
	/* Rehash so lookups see the new binding, then drop any cached
	 * route that the old binding may have selected.
	 */
	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}
586 
587 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
588 {
589 	int ret;
590 
591 	if (lock_sk)
592 		lock_sock(sk);
593 	ret = sock_bindtoindex_locked(sk, ifindex);
594 	if (lock_sk)
595 		release_sock(sk);
596 
597 	return ret;
598 }
599 EXPORT_SYMBOL(sock_bindtoindex);
600 
/* SO_BINDTODEVICE: look up the interface named in @optval and bind
 * @sk to its index ("" or zero length unbinds).  Returns 0 or a
 * negative errno.
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Resolve name -> ifindex under RCU; only the index is
		 * kept, no device reference is held.
		 */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	return sock_bindtoindex(sk, index, true);
out:
#endif

	return ret;
}
646 
/* SO_BINDTODEVICE getsockopt: copy the bound interface name (or a
 * zero length when unbound) to userspace.  @len is the caller's
 * buffer size; the actual length is written back through @optlen.
 */
static int sock_getbindtodevice(struct sock *sk, char __user *optval,
				int __user *optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	/* Not bound: report a zero-length name. */
	if (sk->sk_bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
	if (ret)
		goto out;

	/* Include the trailing NUL in the reported length. */
	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_user(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (put_user(len, optlen))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}
686 
/* Return whether multicast sent on @sk should be looped back locally.
 * Called with a NULL @sk from some forwarding paths, in which case
 * looping is allowed; recursive transmits are never looped.
 */
bool sk_mc_loop(struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	switch (sk->sk_family) {
	case AF_INET:
		return inet_sk(sk)->mc_loop;
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		/* NOTE(review): inet6_sk() does not look valid for
		 * timewait/request sockets -- confirm this path can
		 * never see one.
		 */
		return inet6_sk(sk)->mc_loop;
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);
705 
/* Kernel-side equivalent of setsockopt(SO_REUSEADDR, 1). */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);
713 
/* Kernel-side equivalent of setsockopt(SO_REUSEPORT, 1). */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);
721 
722 void sock_no_linger(struct sock *sk)
723 {
724 	lock_sock(sk);
725 	sk->sk_lingertime = 0;
726 	sock_set_flag(sk, SOCK_LINGER);
727 	release_sock(sk);
728 }
729 EXPORT_SYMBOL(sock_no_linger);
730 
/* Kernel-side equivalent of setsockopt(SO_PRIORITY); no capability
 * check, unlike the userspace path.
 */
void sock_set_priority(struct sock *sk, u32 priority)
{
	lock_sock(sk);
	sk->sk_priority = priority;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_priority);
738 
739 void sock_set_sndtimeo(struct sock *sk, s64 secs)
740 {
741 	lock_sock(sk);
742 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
743 		sk->sk_sndtimeo = secs * HZ;
744 	else
745 		sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
746 	release_sock(sk);
747 }
748 EXPORT_SYMBOL(sock_set_sndtimeo);
749 
/* Common helper for the SO_TIMESTAMP{,NS}_{OLD,NEW} options.
 * @val: enable (true) or disable (false) receive timestamps
 * @new: use the y2038-safe "new" time layout
 * @ns:  report nanosecond rather than microsecond resolution
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	if (val)  {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
		sock_set_flag(sk, SOCK_RCVTSTAMP);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	} else {
		/* Disabling clears all three flags regardless of which
		 * option variant was used to enable them.
		 */
		sock_reset_flag(sk, SOCK_RCVTSTAMP);
		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		sock_reset_flag(sk, SOCK_TSTAMP_NEW);
	}
}
763 
/* Kernel-side helper: enable nanosecond receive timestamps (old layout). */
void sock_enable_timestamps(struct sock *sk)
{
	lock_sock(sk);
	__sock_set_timestamps(sk, true, false, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_enable_timestamps);
771 
/* Kernel-side SO_KEEPALIVE: invoke the protocol keepalive hook (when
 * present) and set the SOCK_KEEPOPEN flag.
 */
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);
781 
/* Set sk_rcvbuf to twice @val (clamped) and lock it against automatic
 * tuning; shared by SO_RCVBUF, SO_RCVBUFFORCE and sock_set_rcvbuf().
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}
802 
/* Kernel-side SO_RCVBUFFORCE-like helper: set the receive buffer size
 * without a capability or sysctl_rmem_max check.
 */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);
810 
811 void sock_set_mark(struct sock *sk, u32 val)
812 {
813 	lock_sock(sk);
814 	sk->sk_mark = val;
815 	release_sock(sk);
816 }
817 EXPORT_SYMBOL(sock_set_mark);
818 
819 /*
820  *	This is meant for all protocols to use and covers goings on
821  *	at the socket level. Everything here is generic.
822  */
823 
824 int sock_setsockopt(struct socket *sock, int level, int optname,
825 		    sockptr_t optval, unsigned int optlen)
826 {
827 	struct sock_txtime sk_txtime;
828 	struct sock *sk = sock->sk;
829 	int val;
830 	int valbool;
831 	struct linger ling;
832 	int ret = 0;
833 
834 	/*
835 	 *	Options without arguments
836 	 */
837 
838 	if (optname == SO_BINDTODEVICE)
839 		return sock_setbindtodevice(sk, optval, optlen);
840 
841 	if (optlen < sizeof(int))
842 		return -EINVAL;
843 
844 	if (copy_from_sockptr(&val, optval, sizeof(val)))
845 		return -EFAULT;
846 
847 	valbool = val ? 1 : 0;
848 
849 	lock_sock(sk);
850 
851 	switch (optname) {
852 	case SO_DEBUG:
853 		if (val && !capable(CAP_NET_ADMIN))
854 			ret = -EACCES;
855 		else
856 			sock_valbool_flag(sk, SOCK_DBG, valbool);
857 		break;
858 	case SO_REUSEADDR:
859 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
860 		break;
861 	case SO_REUSEPORT:
862 		sk->sk_reuseport = valbool;
863 		break;
864 	case SO_TYPE:
865 	case SO_PROTOCOL:
866 	case SO_DOMAIN:
867 	case SO_ERROR:
868 		ret = -ENOPROTOOPT;
869 		break;
870 	case SO_DONTROUTE:
871 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
872 		sk_dst_reset(sk);
873 		break;
874 	case SO_BROADCAST:
875 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
876 		break;
877 	case SO_SNDBUF:
878 		/* Don't error on this BSD doesn't and if you think
879 		 * about it this is right. Otherwise apps have to
880 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
881 		 * are treated in BSD as hints
882 		 */
883 		val = min_t(u32, val, sysctl_wmem_max);
884 set_sndbuf:
885 		/* Ensure val * 2 fits into an int, to prevent max_t()
886 		 * from treating it as a negative value.
887 		 */
888 		val = min_t(int, val, INT_MAX / 2);
889 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
890 		WRITE_ONCE(sk->sk_sndbuf,
891 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
892 		/* Wake up sending tasks if we upped the value. */
893 		sk->sk_write_space(sk);
894 		break;
895 
896 	case SO_SNDBUFFORCE:
897 		if (!capable(CAP_NET_ADMIN)) {
898 			ret = -EPERM;
899 			break;
900 		}
901 
902 		/* No negative values (to prevent underflow, as val will be
903 		 * multiplied by 2).
904 		 */
905 		if (val < 0)
906 			val = 0;
907 		goto set_sndbuf;
908 
909 	case SO_RCVBUF:
910 		/* Don't error on this BSD doesn't and if you think
911 		 * about it this is right. Otherwise apps have to
912 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
913 		 * are treated in BSD as hints
914 		 */
915 		__sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
916 		break;
917 
918 	case SO_RCVBUFFORCE:
919 		if (!capable(CAP_NET_ADMIN)) {
920 			ret = -EPERM;
921 			break;
922 		}
923 
924 		/* No negative values (to prevent underflow, as val will be
925 		 * multiplied by 2).
926 		 */
927 		__sock_set_rcvbuf(sk, max(val, 0));
928 		break;
929 
930 	case SO_KEEPALIVE:
931 		if (sk->sk_prot->keepalive)
932 			sk->sk_prot->keepalive(sk, valbool);
933 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
934 		break;
935 
936 	case SO_OOBINLINE:
937 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
938 		break;
939 
940 	case SO_NO_CHECK:
941 		sk->sk_no_check_tx = valbool;
942 		break;
943 
944 	case SO_PRIORITY:
945 		if ((val >= 0 && val <= 6) ||
946 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
947 			sk->sk_priority = val;
948 		else
949 			ret = -EPERM;
950 		break;
951 
952 	case SO_LINGER:
953 		if (optlen < sizeof(ling)) {
954 			ret = -EINVAL;	/* 1003.1g */
955 			break;
956 		}
957 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
958 			ret = -EFAULT;
959 			break;
960 		}
961 		if (!ling.l_onoff)
962 			sock_reset_flag(sk, SOCK_LINGER);
963 		else {
964 #if (BITS_PER_LONG == 32)
965 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
966 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
967 			else
968 #endif
969 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
970 			sock_set_flag(sk, SOCK_LINGER);
971 		}
972 		break;
973 
974 	case SO_BSDCOMPAT:
975 		break;
976 
977 	case SO_PASSCRED:
978 		if (valbool)
979 			set_bit(SOCK_PASSCRED, &sock->flags);
980 		else
981 			clear_bit(SOCK_PASSCRED, &sock->flags);
982 		break;
983 
984 	case SO_TIMESTAMP_OLD:
985 		__sock_set_timestamps(sk, valbool, false, false);
986 		break;
987 	case SO_TIMESTAMP_NEW:
988 		__sock_set_timestamps(sk, valbool, true, false);
989 		break;
990 	case SO_TIMESTAMPNS_OLD:
991 		__sock_set_timestamps(sk, valbool, false, true);
992 		break;
993 	case SO_TIMESTAMPNS_NEW:
994 		__sock_set_timestamps(sk, valbool, true, true);
995 		break;
996 	case SO_TIMESTAMPING_NEW:
997 		sock_set_flag(sk, SOCK_TSTAMP_NEW);
998 		/* fall through */
999 	case SO_TIMESTAMPING_OLD:
1000 		if (val & ~SOF_TIMESTAMPING_MASK) {
1001 			ret = -EINVAL;
1002 			break;
1003 		}
1004 
1005 		if (val & SOF_TIMESTAMPING_OPT_ID &&
1006 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
1007 			if (sk->sk_protocol == IPPROTO_TCP &&
1008 			    sk->sk_type == SOCK_STREAM) {
1009 				if ((1 << sk->sk_state) &
1010 				    (TCPF_CLOSE | TCPF_LISTEN)) {
1011 					ret = -EINVAL;
1012 					break;
1013 				}
1014 				sk->sk_tskey = tcp_sk(sk)->snd_una;
1015 			} else {
1016 				sk->sk_tskey = 0;
1017 			}
1018 		}
1019 
1020 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
1021 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
1022 			ret = -EINVAL;
1023 			break;
1024 		}
1025 
1026 		sk->sk_tsflags = val;
1027 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
1028 			sock_enable_timestamp(sk,
1029 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
1030 		else {
1031 			if (optname == SO_TIMESTAMPING_NEW)
1032 				sock_reset_flag(sk, SOCK_TSTAMP_NEW);
1033 
1034 			sock_disable_timestamp(sk,
1035 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
1036 		}
1037 		break;
1038 
1039 	case SO_RCVLOWAT:
1040 		if (val < 0)
1041 			val = INT_MAX;
1042 		if (sock->ops->set_rcvlowat)
1043 			ret = sock->ops->set_rcvlowat(sk, val);
1044 		else
1045 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1046 		break;
1047 
1048 	case SO_RCVTIMEO_OLD:
1049 	case SO_RCVTIMEO_NEW:
1050 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1051 				       optlen, optname == SO_RCVTIMEO_OLD);
1052 		break;
1053 
1054 	case SO_SNDTIMEO_OLD:
1055 	case SO_SNDTIMEO_NEW:
1056 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1057 				       optlen, optname == SO_SNDTIMEO_OLD);
1058 		break;
1059 
1060 	case SO_ATTACH_FILTER: {
1061 		struct sock_fprog fprog;
1062 
1063 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1064 		if (!ret)
1065 			ret = sk_attach_filter(&fprog, sk);
1066 		break;
1067 	}
1068 	case SO_ATTACH_BPF:
1069 		ret = -EINVAL;
1070 		if (optlen == sizeof(u32)) {
1071 			u32 ufd;
1072 
1073 			ret = -EFAULT;
1074 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1075 				break;
1076 
1077 			ret = sk_attach_bpf(ufd, sk);
1078 		}
1079 		break;
1080 
1081 	case SO_ATTACH_REUSEPORT_CBPF: {
1082 		struct sock_fprog fprog;
1083 
1084 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1085 		if (!ret)
1086 			ret = sk_reuseport_attach_filter(&fprog, sk);
1087 		break;
1088 	}
1089 	case SO_ATTACH_REUSEPORT_EBPF:
1090 		ret = -EINVAL;
1091 		if (optlen == sizeof(u32)) {
1092 			u32 ufd;
1093 
1094 			ret = -EFAULT;
1095 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1096 				break;
1097 
1098 			ret = sk_reuseport_attach_bpf(ufd, sk);
1099 		}
1100 		break;
1101 
1102 	case SO_DETACH_REUSEPORT_BPF:
1103 		ret = reuseport_detach_prog(sk);
1104 		break;
1105 
1106 	case SO_DETACH_FILTER:
1107 		ret = sk_detach_filter(sk);
1108 		break;
1109 
1110 	case SO_LOCK_FILTER:
1111 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1112 			ret = -EPERM;
1113 		else
1114 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1115 		break;
1116 
1117 	case SO_PASSSEC:
1118 		if (valbool)
1119 			set_bit(SOCK_PASSSEC, &sock->flags);
1120 		else
1121 			clear_bit(SOCK_PASSSEC, &sock->flags);
1122 		break;
1123 	case SO_MARK:
1124 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1125 			ret = -EPERM;
1126 		} else if (val != sk->sk_mark) {
1127 			sk->sk_mark = val;
1128 			sk_dst_reset(sk);
1129 		}
1130 		break;
1131 
1132 	case SO_RXQ_OVFL:
1133 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1134 		break;
1135 
1136 	case SO_WIFI_STATUS:
1137 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1138 		break;
1139 
1140 	case SO_PEEK_OFF:
1141 		if (sock->ops->set_peek_off)
1142 			ret = sock->ops->set_peek_off(sk, val);
1143 		else
1144 			ret = -EOPNOTSUPP;
1145 		break;
1146 
1147 	case SO_NOFCS:
1148 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1149 		break;
1150 
1151 	case SO_SELECT_ERR_QUEUE:
1152 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1153 		break;
1154 
1155 #ifdef CONFIG_NET_RX_BUSY_POLL
1156 	case SO_BUSY_POLL:
1157 		/* allow unprivileged users to decrease the value */
1158 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1159 			ret = -EPERM;
1160 		else {
1161 			if (val < 0)
1162 				ret = -EINVAL;
1163 			else
1164 				sk->sk_ll_usec = val;
1165 		}
1166 		break;
1167 #endif
1168 
1169 	case SO_MAX_PACING_RATE:
1170 		{
1171 		unsigned long ulval = (val == ~0U) ? ~0UL : val;
1172 
1173 		if (sizeof(ulval) != sizeof(val) &&
1174 		    optlen >= sizeof(ulval) &&
1175 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1176 			ret = -EFAULT;
1177 			break;
1178 		}
1179 		if (ulval != ~0UL)
1180 			cmpxchg(&sk->sk_pacing_status,
1181 				SK_PACING_NONE,
1182 				SK_PACING_NEEDED);
1183 		sk->sk_max_pacing_rate = ulval;
1184 		sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1185 		break;
1186 		}
1187 	case SO_INCOMING_CPU:
1188 		WRITE_ONCE(sk->sk_incoming_cpu, val);
1189 		break;
1190 
1191 	case SO_CNX_ADVICE:
1192 		if (val == 1)
1193 			dst_negative_advice(sk);
1194 		break;
1195 
1196 	case SO_ZEROCOPY:
1197 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1198 			if (!((sk->sk_type == SOCK_STREAM &&
1199 			       sk->sk_protocol == IPPROTO_TCP) ||
1200 			      (sk->sk_type == SOCK_DGRAM &&
1201 			       sk->sk_protocol == IPPROTO_UDP)))
1202 				ret = -ENOTSUPP;
1203 		} else if (sk->sk_family != PF_RDS) {
1204 			ret = -ENOTSUPP;
1205 		}
1206 		if (!ret) {
1207 			if (val < 0 || val > 1)
1208 				ret = -EINVAL;
1209 			else
1210 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1211 		}
1212 		break;
1213 
1214 	case SO_TXTIME:
1215 		if (optlen != sizeof(struct sock_txtime)) {
1216 			ret = -EINVAL;
1217 			break;
1218 		} else if (copy_from_sockptr(&sk_txtime, optval,
1219 			   sizeof(struct sock_txtime))) {
1220 			ret = -EFAULT;
1221 			break;
1222 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1223 			ret = -EINVAL;
1224 			break;
1225 		}
1226 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1227 		 * scheduler has enough safe guards.
1228 		 */
1229 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1230 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1231 			ret = -EPERM;
1232 			break;
1233 		}
1234 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1235 		sk->sk_clockid = sk_txtime.clockid;
1236 		sk->sk_txtime_deadline_mode =
1237 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1238 		sk->sk_txtime_report_errors =
1239 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1240 		break;
1241 
1242 	case SO_BINDTOIFINDEX:
1243 		ret = sock_bindtoindex_locked(sk, val);
1244 		break;
1245 
1246 	default:
1247 		ret = -ENOPROTOOPT;
1248 		break;
1249 	}
1250 	release_sock(sk);
1251 	return ret;
1252 }
1253 EXPORT_SYMBOL(sock_setsockopt);
1254 
1255 
1256 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1257 			  struct ucred *ucred)
1258 {
1259 	ucred->pid = pid_vnr(pid);
1260 	ucred->uid = ucred->gid = -1;
1261 	if (cred) {
1262 		struct user_namespace *current_ns = current_user_ns();
1263 
1264 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1265 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1266 	}
1267 }
1268 
1269 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1270 {
1271 	struct user_namespace *user_ns = current_user_ns();
1272 	int i;
1273 
1274 	for (i = 0; i < src->ngroups; i++)
1275 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1276 			return -EFAULT;
1277 
1278 	return 0;
1279 }
1280 
/* sock_getsockopt - SOL_SOCKET level getsockopt() handler.
 *
 * Reads the requested option for @sock into @optval and writes the
 * number of bytes produced back through @optlen.  Fixed-size replies
 * go through the scratch union below and the common copy at the end;
 * variable-sized replies (SO_PEERCRED, SO_PEERGROUPS, SO_PEERNAME,
 * SO_GET_FILTER, SO_MEMINFO) copy to user space themselves and jump
 * to lenout.  Returns 0 on success or a negative errno.
 */
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	/* Scratch reply buffer shared by all fixed-size options. */
	union {
		int val;
		u64 val64;
		unsigned long ulval;
		struct linger ling;
		struct old_timeval32 tm32;
		struct __kernel_old_timeval tm;
		struct  __kernel_sock_timeval stm;
		struct sock_txtime txtime;
	} v;

	/* lv = number of valid bytes in v for the selected option */
	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	/* Zero the whole union so no uninitialized kernel stack bytes
	 * can leak to user space via partially-filled replies.
	 */
	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		/* Hard error first; if none pending, report (and clear)
		 * any soft error.
		 */
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		/* sk_lingertime is kept in jiffies; report seconds. */
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		/* Obsolete; always reads back 0. */
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
		v.val = sk->sk_tsflags;
		break;

	case SO_RCVTIMEO_OLD:
	case SO_RCVTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
		break;

	case SO_SNDTIMEO_OLD:
	case SO_SNDTIMEO_NEW:
		lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		/* SO_SNDLOWAT is not settable here; always reads as 1. */
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		/* Truncation to the user's buffer size is allowed here. */
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		/* NOTE(review): sk_peer_pid/sk_peer_cred are read without
		 * a lock here — verify against concurrent updaters.
		 */
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERGROUPS:
	{
		int ret, n;

		if (!sk->sk_peer_cred)
			return -ENODATA;

		/* If the buffer is too small, tell the caller the size
		 * it needs via *optlen and return -ERANGE.
		 */
		n = sk->sk_peer_cred->group_info->ngroups;
		if (len < n * sizeof(gid_t)) {
			len = n * sizeof(gid_t);
			return put_user(len, optlen) ? -EFAULT : -ERANGE;
		}
		len = n * sizeof(gid_t);

		ret = groups_to_user((gid_t __user *)optval,
				     sk->sk_peer_cred->group_info);
		if (ret)
			return ret;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		/* Third argument nonzero requests the remote (peer)
		 * address; getname() returns its length or an error.
		 */
		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
		if (lv < 0)
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;

	case SO_BINDTODEVICE:
		return sock_getbindtodevice(sk, optval, optlen, len);

	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;

	case SO_LOCK_FILTER:
		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
		break;

	case SO_BPF_EXTENSIONS:
		v.val = bpf_tell_extensions();
		break;

	case SO_SELECT_ERR_QUEUE:
		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
		break;

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_BUSY_POLL:
		v.val = sk->sk_ll_usec;
		break;
#endif

	case SO_MAX_PACING_RATE:
		/* The rate is an unsigned long; give the full 64-bit
		 * value when the kernel is 64-bit and the user buffer
		 * is large enough, otherwise clamp to 32 bits.
		 */
		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
			lv = sizeof(v.ulval);
			v.ulval = sk->sk_max_pacing_rate;
		} else {
			/* 32bit version */
			v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
		}
		break;

	case SO_INCOMING_CPU:
		v.val = READ_ONCE(sk->sk_incoming_cpu);
		break;

	case SO_MEMINFO:
	{
		u32 meminfo[SK_MEMINFO_VARS];

		sk_get_meminfo(sk, meminfo);

		len = min_t(unsigned int, len, sizeof(meminfo));
		if (copy_to_user(optval, &meminfo, len))
			return -EFAULT;

		goto lenout;
	}

#ifdef CONFIG_NET_RX_BUSY_POLL
	case SO_INCOMING_NAPI_ID:
		v.val = READ_ONCE(sk->sk_napi_id);

		/* aggregate non-NAPI IDs down to 0 */
		if (v.val < MIN_NAPI_ID)
			v.val = 0;

		break;
#endif

	case SO_COOKIE:
		/* Fixed 64-bit reply; refuse buffers that cannot hold it. */
		lv = sizeof(u64);
		if (len < lv)
			return -EINVAL;
		v.val64 = sock_gen_cookie(sk);
		break;

	case SO_ZEROCOPY:
		v.val = sock_flag(sk, SOCK_ZEROCOPY);
		break;

	case SO_TXTIME:
		lv = sizeof(v.txtime);
		v.txtime.clockid = sk->sk_clockid;
		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
				  SOF_TXTIME_DEADLINE_MODE : 0;
		v.txtime.flags |= sk->sk_txtime_report_errors ?
				  SOF_TXTIME_REPORT_ERRORS : 0;
		break;

	case SO_BINDTOIFINDEX:
		v.val = sk->sk_bound_dev_if;
		break;

	default:
		/* We implement the SO_SNDLOWAT etc to not be settable
		 * (1003.1g 7).
		 */
		return -ENOPROTOOPT;
	}

	/* Never copy more bytes than the option produced. */
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	/* Report the number of bytes actually written. */
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
1611 
1612 /*
1613  * Initialize an sk_lock.
1614  *
1615  * (We also register the sk_lock with the lock validator.)
1616  */
1617 static inline void sock_lock_init(struct sock *sk)
1618 {
1619 	if (sk->sk_kern_sock)
1620 		sock_lock_init_class_and_name(
1621 			sk,
1622 			af_family_kern_slock_key_strings[sk->sk_family],
1623 			af_family_kern_slock_keys + sk->sk_family,
1624 			af_family_kern_key_strings[sk->sk_family],
1625 			af_family_kern_keys + sk->sk_family);
1626 	else
1627 		sock_lock_init_class_and_name(
1628 			sk,
1629 			af_family_slock_key_strings[sk->sk_family],
1630 			af_family_slock_keys + sk->sk_family,
1631 			af_family_key_strings[sk->sk_family],
1632 			af_family_keys + sk->sk_family);
1633 }
1634 
1635 /*
1636  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1637  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1638  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1639  */
1640 static void sock_copy(struct sock *nsk, const struct sock *osk)
1641 {
1642 	const struct proto *prot = READ_ONCE(osk->sk_prot);
1643 #ifdef CONFIG_SECURITY_NETWORK
1644 	void *sptr = nsk->sk_security;
1645 #endif
1646 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1647 
1648 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1649 	       prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1650 
1651 #ifdef CONFIG_SECURITY_NETWORK
1652 	nsk->sk_security = sptr;
1653 	security_sk_clone(osk, nsk);
1654 #endif
1655 }
1656 
/* Allocate a struct sock sized for @prot, from the protocol's slab
 * cache when one exists, else with kmalloc().  Also attaches the LSM
 * security blob and pins the protocol module.  Returns NULL on any
 * failure, with partial work undone.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		/* Strip __GFP_ZERO for slab objects: when zeroing is
		 * requested it is done via sk_prot_clear_nulls() instead
		 * (presumably to preserve the RCU "nulls" list marker —
		 * see its definition to confirm).
		 */
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (want_init_on_alloc(priority))
			sk_prot_clear_nulls(sk, prot->obj_size);
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		/* Keep the owning module loaded while the sock exists;
		 * dropped again in sk_prot_free().
		 */
		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}
1693 
1694 static void sk_prot_free(struct proto *prot, struct sock *sk)
1695 {
1696 	struct kmem_cache *slab;
1697 	struct module *owner;
1698 
1699 	owner = prot->owner;
1700 	slab = prot->slab;
1701 
1702 	cgroup_sk_free(&sk->sk_cgrp_data);
1703 	mem_cgroup_sk_free(sk);
1704 	security_sk_free(sk);
1705 	if (slab != NULL)
1706 		kmem_cache_free(slab, sk);
1707 	else
1708 		kfree(sk);
1709 	module_put(owner);
1710 }
1711 
1712 /**
1713  *	sk_alloc - All socket objects are allocated here
1714  *	@net: the applicable net namespace
1715  *	@family: protocol family
1716  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1717  *	@prot: struct proto associated with this new sock instance
1718  *	@kern: is this to be a kernel socket?
1719  */
1720 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1721 		      struct proto *prot, int kern)
1722 {
1723 	struct sock *sk;
1724 
1725 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1726 	if (sk) {
1727 		sk->sk_family = family;
1728 		/*
1729 		 * See comment in struct sock definition to understand
1730 		 * why we need sk_prot_creator -acme
1731 		 */
1732 		sk->sk_prot = sk->sk_prot_creator = prot;
1733 		sk->sk_kern_sock = kern;
1734 		sock_lock_init(sk);
1735 		sk->sk_net_refcnt = kern ? 0 : 1;
1736 		if (likely(sk->sk_net_refcnt)) {
1737 			get_net(net);
1738 			sock_inuse_add(net, 1);
1739 		}
1740 
1741 		sock_net_set(sk, net);
1742 		refcount_set(&sk->sk_wmem_alloc, 1);
1743 
1744 		mem_cgroup_sk_alloc(sk);
1745 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1746 		sock_update_classid(&sk->sk_cgrp_data);
1747 		sock_update_netprioidx(&sk->sk_cgrp_data);
1748 		sk_tx_queue_clear(sk);
1749 	}
1750 
1751 	return sk;
1752 }
1753 EXPORT_SYMBOL(sk_alloc);
1754 
/* Sockets having SOCK_RCU_FREE will call this function after one RCU
 * grace period. This is the case for UDP sockets and TCP listeners.
 */
static void __sk_destruct(struct rcu_head *head)
{
	struct sock *sk = container_of(head, struct sock, sk_rcu);
	struct sk_filter *filter;

	/* Protocol/owner specific destructor runs first. */
	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	/* sk_wmem_alloc == 0 proves we are the last user, which makes
	 * this lockless dereference of sk_filter safe.
	 */
	filter = rcu_dereference_check(sk->sk_filter,
				       refcount_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

#ifdef CONFIG_BPF_SYSCALL
	bpf_sk_storage_free(sk);
#endif

	/* Any option memory still charged at this point was leaked. */
	if (atomic_read(&sk->sk_omem_alloc))
		pr_debug("%s: optmem leakage (%d bytes) detected\n",
			 __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_frag.page) {
		put_page(sk->sk_frag.page);
		sk->sk_frag.page = NULL;
	}

	/* Drop peer credentials and, for user sockets, the netns ref
	 * taken in sk_alloc(); finally give the object back.
	 */
	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	if (likely(sk->sk_net_refcnt))
		put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}
1795 
1796 void sk_destruct(struct sock *sk)
1797 {
1798 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
1799 
1800 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
1801 		reuseport_detach_sock(sk);
1802 		use_call_rcu = true;
1803 	}
1804 
1805 	if (use_call_rcu)
1806 		call_rcu(&sk->sk_rcu, __sk_destruct);
1807 	else
1808 		__sk_destruct(&sk->sk_rcu);
1809 }
1810 
1811 static void __sk_free(struct sock *sk)
1812 {
1813 	if (likely(sk->sk_net_refcnt))
1814 		sock_inuse_add(sock_net(sk), -1);
1815 
1816 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1817 		sock_diag_broadcast_destroy(sk);
1818 	else
1819 		sk_destruct(sk);
1820 }
1821 
1822 void sk_free(struct sock *sk)
1823 {
1824 	/*
1825 	 * We subtract one from sk_wmem_alloc and can know if
1826 	 * some packets are still in some tx queue.
1827 	 * If not null, sock_wfree() will call __sk_free(sk) later
1828 	 */
1829 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1830 		__sk_free(sk);
1831 }
1832 EXPORT_SYMBOL(sk_free);
1833 
/* Shared queue/lock initialization: set up the receive, write and
 * error skb queues plus per-address-family lockdep classes for their
 * locks and for sk_callback_lock.  Called from sk_clone_lock() and
 * other socket setup paths.
 */
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}
1854 
1855 /**
1856  *	sk_clone_lock - clone a socket, and lock its clone
1857  *	@sk: the socket to clone
1858  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1859  *
1860  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1861  */
1862 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1863 {
1864 	struct proto *prot = READ_ONCE(sk->sk_prot);
1865 	struct sock *newsk;
1866 	bool is_charged = true;
1867 
1868 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
1869 	if (newsk != NULL) {
1870 		struct sk_filter *filter;
1871 
1872 		sock_copy(newsk, sk);
1873 
1874 		newsk->sk_prot_creator = prot;
1875 
1876 		/* SANITY */
1877 		if (likely(newsk->sk_net_refcnt))
1878 			get_net(sock_net(newsk));
1879 		sk_node_init(&newsk->sk_node);
1880 		sock_lock_init(newsk);
1881 		bh_lock_sock(newsk);
1882 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1883 		newsk->sk_backlog.len = 0;
1884 
1885 		atomic_set(&newsk->sk_rmem_alloc, 0);
1886 		/*
1887 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1888 		 */
1889 		refcount_set(&newsk->sk_wmem_alloc, 1);
1890 		atomic_set(&newsk->sk_omem_alloc, 0);
1891 		sk_init_common(newsk);
1892 
1893 		newsk->sk_dst_cache	= NULL;
1894 		newsk->sk_dst_pending_confirm = 0;
1895 		newsk->sk_wmem_queued	= 0;
1896 		newsk->sk_forward_alloc = 0;
1897 		atomic_set(&newsk->sk_drops, 0);
1898 		newsk->sk_send_head	= NULL;
1899 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1900 		atomic_set(&newsk->sk_zckey, 0);
1901 
1902 		sock_reset_flag(newsk, SOCK_DONE);
1903 
1904 		/* sk->sk_memcg will be populated at accept() time */
1905 		newsk->sk_memcg = NULL;
1906 
1907 		cgroup_sk_clone(&newsk->sk_cgrp_data);
1908 
1909 		rcu_read_lock();
1910 		filter = rcu_dereference(sk->sk_filter);
1911 		if (filter != NULL)
1912 			/* though it's an empty new sock, the charging may fail
1913 			 * if sysctl_optmem_max was changed between creation of
1914 			 * original socket and cloning
1915 			 */
1916 			is_charged = sk_filter_charge(newsk, filter);
1917 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1918 		rcu_read_unlock();
1919 
1920 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1921 			/* We need to make sure that we don't uncharge the new
1922 			 * socket if we couldn't charge it in the first place
1923 			 * as otherwise we uncharge the parent's filter.
1924 			 */
1925 			if (!is_charged)
1926 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1927 			sk_free_unlock_clone(newsk);
1928 			newsk = NULL;
1929 			goto out;
1930 		}
1931 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1932 
1933 		if (bpf_sk_storage_clone(sk, newsk)) {
1934 			sk_free_unlock_clone(newsk);
1935 			newsk = NULL;
1936 			goto out;
1937 		}
1938 
1939 		/* Clear sk_user_data if parent had the pointer tagged
1940 		 * as not suitable for copying when cloning.
1941 		 */
1942 		if (sk_user_data_is_nocopy(newsk))
1943 			newsk->sk_user_data = NULL;
1944 
1945 		newsk->sk_err	   = 0;
1946 		newsk->sk_err_soft = 0;
1947 		newsk->sk_priority = 0;
1948 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1949 		if (likely(newsk->sk_net_refcnt))
1950 			sock_inuse_add(sock_net(newsk), 1);
1951 
1952 		/*
1953 		 * Before updating sk_refcnt, we must commit prior changes to memory
1954 		 * (Documentation/RCU/rculist_nulls.rst for details)
1955 		 */
1956 		smp_wmb();
1957 		refcount_set(&newsk->sk_refcnt, 2);
1958 
1959 		/*
1960 		 * Increment the counter in the same struct proto as the master
1961 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1962 		 * is the same as sk->sk_prot->socks, as this field was copied
1963 		 * with memcpy).
1964 		 *
1965 		 * This _changes_ the previous behaviour, where
1966 		 * tcp_create_openreq_child always was incrementing the
1967 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1968 		 * to be taken into account in all callers. -acme
1969 		 */
1970 		sk_refcnt_debug_inc(newsk);
1971 		sk_set_socket(newsk, NULL);
1972 		sk_tx_queue_clear(newsk);
1973 		RCU_INIT_POINTER(newsk->sk_wq, NULL);
1974 
1975 		if (newsk->sk_prot->sockets_allocated)
1976 			sk_sockets_allocated_inc(newsk);
1977 
1978 		if (sock_needs_netstamp(sk) &&
1979 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1980 			net_enable_timestamp();
1981 	}
1982 out:
1983 	return newsk;
1984 }
1985 EXPORT_SYMBOL_GPL(sk_clone_lock);
1986 
/* Free a clone obtained from sk_clone_lock() that cannot be kept: it is
 * still a raw copy of its parent, so clear sk_destruct to avoid running
 * the parent's destructor on shared state, release the bh lock taken by
 * sk_clone_lock(), and do a plain sk_free().
 */
void sk_free_unlock_clone(struct sock *sk)
{
	sk->sk_destruct = NULL;
	bh_unlock_sock(sk);
	sk_free(sk);
}
EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1996 
/* Attach route @dst to @sk and derive the socket's offload capability
 * bits (sk_route_caps) and GSO limits from the egress device features.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	u32 max_segs = 1;

	sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
	/* Any GSO-capable path can also use software GSO variants. */
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		/* A dst with header_len and no usable xfrm offload
		 * (presumably a transforming tunnel) cannot take GSO
		 * packets — strip the GSO bits in that case.
		 */
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
2018 
2019 /*
2020  *	Simple resource managers for sockets.
2021  */
2022 
2023 
2024 /*
2025  * Write buffer destructor automatically called from kfree_skb.
2026  */
2027 void sock_wfree(struct sk_buff *skb)
2028 {
2029 	struct sock *sk = skb->sk;
2030 	unsigned int len = skb->truesize;
2031 
2032 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2033 		/*
2034 		 * Keep a reference on sk_wmem_alloc, this will be released
2035 		 * after sk_write_space() call
2036 		 */
2037 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2038 		sk->sk_write_space(sk);
2039 		len = 1;
2040 	}
2041 	/*
2042 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2043 	 * could not do because of in-flight packets
2044 	 */
2045 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2046 		__sk_free(sk);
2047 }
2048 EXPORT_SYMBOL(sock_wfree);
2049 
2050 /* This variant of sock_wfree() is used by TCP,
2051  * since it sets SOCK_USE_WRITE_QUEUE.
2052  */
2053 void __sock_wfree(struct sk_buff *skb)
2054 {
2055 	struct sock *sk = skb->sk;
2056 
2057 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2058 		__sk_free(sk);
2059 }
2060 
/* Make @sk the write-side owner of @skb: charge the skb's truesize to
 * sk_wmem_alloc and arrange for sock_wfree() to uncharge it on free.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	/* Non-full sockets (request/timewait) cannot take the wmem
	 * charge; hold a plain reference released by sock_edemux.
	 */
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);
2082 
2083 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2084 {
2085 #ifdef CONFIG_TLS_DEVICE
2086 	/* Drivers depend on in-order delivery for crypto offload,
2087 	 * partial orphan breaks out-of-order-OK logic.
2088 	 */
2089 	if (skb->decrypted)
2090 		return false;
2091 #endif
2092 	return (skb->destructor == sock_wfree ||
2093 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2094 }
2095 
/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	if (skb_is_tcp_pure_ack(skb))
		return;

	if (can_skb_orphan_partial(skb)) {
		struct sock *sk = skb->sk;

		/* Swap the wmem charge for a plain socket reference
		 * (released by sock_efree), so the owner regains its
		 * send budget while skb->sk stays valid.
		 */
		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
			skb->destructor = sock_efree;
		}
	} else {
		/* Not safe to keep skb->sk; fully detach instead. */
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);
2119 
2120 /*
2121  * Read buffer destructor automatically called from kfree_skb.
2122  */
2123 void sock_rfree(struct sk_buff *skb)
2124 {
2125 	struct sock *sk = skb->sk;
2126 	unsigned int len = skb->truesize;
2127 
2128 	atomic_sub(len, &sk->sk_rmem_alloc);
2129 	sk_mem_uncharge(sk, len);
2130 }
2131 EXPORT_SYMBOL(sock_rfree);
2132 
2133 /*
2134  * Buffer destructor for skbs that are not used directly in read or write
2135  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2136  */
2137 void sock_efree(struct sk_buff *skb)
2138 {
2139 	sock_put(skb->sk);
2140 }
2141 EXPORT_SYMBOL(sock_efree);
2142 
2143 /* Buffer destructor for prefetch/receive path where reference count may
2144  * not be held, e.g. for listen sockets.
2145  */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* Only refcounted (full) sockets hold a reference to drop. */
	if (sk_is_refcounted(sk))
		sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */
2154 
2155 kuid_t sock_i_uid(struct sock *sk)
2156 {
2157 	kuid_t uid;
2158 
2159 	read_lock_bh(&sk->sk_callback_lock);
2160 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2161 	read_unlock_bh(&sk->sk_callback_lock);
2162 	return uid;
2163 }
2164 EXPORT_SYMBOL(sock_i_uid);
2165 
2166 unsigned long sock_i_ino(struct sock *sk)
2167 {
2168 	unsigned long ino;
2169 
2170 	read_lock_bh(&sk->sk_callback_lock);
2171 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2172 	read_unlock_bh(&sk->sk_callback_lock);
2173 	return ino;
2174 }
2175 EXPORT_SYMBOL(sock_i_ino);
2176 
2177 /*
2178  * Allocate a skb from the socket's send buffer.
2179  */
2180 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2181 			     gfp_t priority)
2182 {
2183 	if (force ||
2184 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2185 		struct sk_buff *skb = alloc_skb(size, priority);
2186 
2187 		if (skb) {
2188 			skb_set_owner_w(skb, sk);
2189 			return skb;
2190 		}
2191 	}
2192 	return NULL;
2193 }
2194 EXPORT_SYMBOL(sock_wmalloc);
2195 
/* Destructor for skbs allocated by sock_omalloc(): uncharge the skb's
 * truesize from the socket's option memory accounting.
 */
static void sock_ofree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
}
2202 
/* Allocate an skb charged to the socket's option memory accounting
 * (sk_omem_alloc), bounded by sysctl_optmem_max.  Returns NULL when
 * over the limit or on allocation failure.
 */
struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
			     gfp_t priority)
{
	struct sk_buff *skb;

	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
	    sysctl_optmem_max)
		return NULL;

	skb = alloc_skb(size, priority);
	if (!skb)
		return NULL;

	/* Charge the real truesize and have sock_ofree() uncharge it
	 * when the skb is freed.
	 */
	atomic_add(skb->truesize, &sk->sk_omem_alloc);
	skb->sk = sk;
	skb->destructor = sock_ofree;
	return skb;
}
2222 
2223 /*
2224  * Allocate a memory block from the socket's option memory buffer.
2225  */
2226 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2227 {
2228 	if ((unsigned int)size <= sysctl_optmem_max &&
2229 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2230 		void *mem;
2231 		/* First do the add, to avoid the race if kmalloc
2232 		 * might sleep.
2233 		 */
2234 		atomic_add(size, &sk->sk_omem_alloc);
2235 		mem = kmalloc(size, priority);
2236 		if (mem)
2237 			return mem;
2238 		atomic_sub(size, &sk->sk_omem_alloc);
2239 	}
2240 	return NULL;
2241 }
2242 EXPORT_SYMBOL(sock_kmalloc);
2243 
/* Free an option memory block. Note, we actually want the inline
 * here as this allows gcc to detect the nullify and fold away the
 * condition entirely.
 *
 * @nullify: when true, scrub the memory before freeing
 *	     (kfree_sensitive), for buffers that may hold secrets.
 */
static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
				  const bool nullify)
{
	if (WARN_ON_ONCE(!mem))
		return;
	if (nullify)
		kfree_sensitive(mem);
	else
		kfree(mem);
	/* Uncharge the option memory reserved by sock_kmalloc(). */
	atomic_sub(size, &sk->sk_omem_alloc);
}
2259 
/* Free a block obtained from sock_kmalloc(). */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, false);
}
EXPORT_SYMBOL(sock_kfree_s);
2265 
/* As sock_kfree_s(), but scrubs the memory before freeing it. */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
	__sock_kfree_s(sk, mem, size, true);
}
EXPORT_SYMBOL(sock_kzfree_s);
2271 
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think, these locks should be removed for datagram sockets.
 */
/* Sleep (up to @timeo jiffies) until sk_wmem_alloc drops below
 * sk_sndbuf, or the socket is shut down for sending, has a pending
 * error, or a signal arrives.  Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		/* Conditions are re-checked after prepare_to_wait() so a
		 * wakeup between the test and schedule_timeout() is not lost.
		 */
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}
2298 
2299 
2300 /*
2301  *	Generic send/receive buffer handlers
2302  */
2303 
/* Allocate a send skb with @header_len linear bytes and @data_len bytes
 * in page frags (up to @max_page_order per page).  Blocks (subject to
 * the socket send timeout) until write memory is available, unless
 * @noblock is set.  On failure returns NULL and stores the error in
 * *@errcode.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		/* Enough send-buffer room: go allocate. */
		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);
2347 
/* Linear-only convenience wrapper around sock_alloc_send_pskb(). */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
2354 
/* Parse one SOL_SOCKET-level control message into @sockc.
 * Returns 0 on success, -EINVAL for malformed/unknown messages,
 * -EPERM when the caller lacks the capability for SO_MARK.
 */
int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		/* Replace only the TX-record bits, keep the others. */
		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		/* u64 in a cmsg payload may be unaligned. */
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);
2396 
2397 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2398 		   struct sockcm_cookie *sockc)
2399 {
2400 	struct cmsghdr *cmsg;
2401 	int ret;
2402 
2403 	for_each_cmsghdr(cmsg, msg) {
2404 		if (!CMSG_OK(msg, cmsg))
2405 			return -EINVAL;
2406 		if (cmsg->cmsg_level != SOL_SOCKET)
2407 			continue;
2408 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2409 		if (ret)
2410 			return ret;
2411 	}
2412 	return 0;
2413 }
2414 EXPORT_SYMBOL(sock_cmsg_send);
2415 
2416 static void sk_enter_memory_pressure(struct sock *sk)
2417 {
2418 	if (!sk->sk_prot->enter_memory_pressure)
2419 		return;
2420 
2421 	sk->sk_prot->enter_memory_pressure(sk);
2422 }
2423 
/* Clear memory-pressure state, either via the protocol's callback or by
 * resetting the protocol's shared memory_pressure flag directly.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		sk->sk_prot->leave_memory_pressure(sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		/* READ_ONCE/WRITE_ONCE: the flag is shared and written
		 * without a lock; only clear it if currently set.
		 */
		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}
2435 
/* Page order used by skb_page_frag_refill(): try 32KB compound pages
 * first, unless disabled via the static key below.
 */
#define SKB_FRAG_PAGE_ORDER	get_order(32768)
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2438 
2439 /**
2440  * skb_page_frag_refill - check that a page_frag contains enough room
2441  * @sz: minimum size of the fragment we want to get
2442  * @pfrag: pointer to page_frag
2443  * @gfp: priority for memory allocation
2444  *
2445  * Note: While this allocator tries to use high order pages, there is
2446  * no guarantee that allocations succeed. Therefore, @sz MUST be
2447  * less or equal than PAGE_SIZE.
2448  */
2449 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2450 {
2451 	if (pfrag->page) {
2452 		if (page_ref_count(pfrag->page) == 1) {
2453 			pfrag->offset = 0;
2454 			return true;
2455 		}
2456 		if (pfrag->offset + sz <= pfrag->size)
2457 			return true;
2458 		put_page(pfrag->page);
2459 	}
2460 
2461 	pfrag->offset = 0;
2462 	if (SKB_FRAG_PAGE_ORDER &&
2463 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2464 		/* Avoid direct reclaim but allow kswapd to wake */
2465 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2466 					  __GFP_COMP | __GFP_NOWARN |
2467 					  __GFP_NORETRY,
2468 					  SKB_FRAG_PAGE_ORDER);
2469 		if (likely(pfrag->page)) {
2470 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2471 			return true;
2472 		}
2473 	}
2474 	pfrag->page = alloc_page(gfp);
2475 	if (likely(pfrag->page)) {
2476 		pfrag->size = PAGE_SIZE;
2477 		return true;
2478 	}
2479 	return false;
2480 }
2481 EXPORT_SYMBOL(skb_page_frag_refill);
2482 
/* Refill @pfrag for socket use; on failure enter memory pressure and
 * shrink the send buffer before reporting false.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	sk_enter_memory_pressure(sk);
	sk_stream_moderate_sndbuf(sk);
	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);
2493 
/* Slow path of lock_sock(): sleep until the owner releases the socket.
 * Called with sk_lock.slock held; drops it while sleeping and returns
 * with it re-held and sk no longer owned by another task.
 */
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}
2511 
/* Process every skb queued on the socket backlog.  Called with
 * sk_lock.slock held; the lock is dropped while each batch is
 * processed so softirq producers can keep queueing.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;

	while ((skb = sk->sk_backlog.head) != NULL) {
		/* Detach the current batch, then process it unlocked. */
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantee we can not loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}
2544 
/* Drain the socket backlog under sk_lock.slock. */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}
2551 
2552 /**
2553  * sk_wait_data - wait for data to arrive at sk_receive_queue
2554  * @sk:    sock to wait on
2555  * @timeo: for how long
2556  * @skb:   last skb seen on sk_receive_queue
2557  *
2558  * Now socket state including sk->sk_err is changed only under lock,
2559  * hence we may omit checks after joining wait queue.
2560  * We check receive queue before schedule() only as optimization;
2561  * it is very likely that release_sock() added new data.
2562  */
2563 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2564 {
2565 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2566 	int rc;
2567 
2568 	add_wait_queue(sk_sleep(sk), &wait);
2569 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2570 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2571 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2572 	remove_wait_queue(sk_sleep(sk), &wait);
2573 	return rc;
2574 }
2575 EXPORT_SYMBOL(sk_wait_data);
2576 
2577 /**
2578  *	__sk_mem_raise_allocated - increase memory_allocated
2579  *	@sk: socket
2580  *	@size: memory size to allocate
2581  *	@amt: pages to allocate
2582  *	@kind: allocation type
2583  *
2584  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2585  */
2586 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2587 {
2588 	struct proto *prot = sk->sk_prot;
2589 	long allocated = sk_memory_allocated_add(sk, amt);
2590 	bool charged = true;
2591 
2592 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2593 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2594 		goto suppress_allocation;
2595 
2596 	/* Under limit. */
2597 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2598 		sk_leave_memory_pressure(sk);
2599 		return 1;
2600 	}
2601 
2602 	/* Under pressure. */
2603 	if (allocated > sk_prot_mem_limits(sk, 1))
2604 		sk_enter_memory_pressure(sk);
2605 
2606 	/* Over hard limit. */
2607 	if (allocated > sk_prot_mem_limits(sk, 2))
2608 		goto suppress_allocation;
2609 
2610 	/* guarantee minimum buffer size under pressure */
2611 	if (kind == SK_MEM_RECV) {
2612 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2613 			return 1;
2614 
2615 	} else { /* SK_MEM_SEND */
2616 		int wmem0 = sk_get_wmem0(sk, prot);
2617 
2618 		if (sk->sk_type == SOCK_STREAM) {
2619 			if (sk->sk_wmem_queued < wmem0)
2620 				return 1;
2621 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2622 				return 1;
2623 		}
2624 	}
2625 
2626 	if (sk_has_memory_pressure(sk)) {
2627 		u64 alloc;
2628 
2629 		if (!sk_under_memory_pressure(sk))
2630 			return 1;
2631 		alloc = sk_sockets_allocated_read_positive(sk);
2632 		if (sk_prot_mem_limits(sk, 2) > alloc *
2633 		    sk_mem_pages(sk->sk_wmem_queued +
2634 				 atomic_read(&sk->sk_rmem_alloc) +
2635 				 sk->sk_forward_alloc))
2636 			return 1;
2637 	}
2638 
2639 suppress_allocation:
2640 
2641 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2642 		sk_stream_moderate_sndbuf(sk);
2643 
2644 		/* Fail only if socket is _under_ its sndbuf.
2645 		 * In this case we cannot block, so that we have to fail.
2646 		 */
2647 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2648 			return 1;
2649 	}
2650 
2651 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2652 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2653 
2654 	sk_memory_allocated_sub(sk, amt);
2655 
2656 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2657 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2658 
2659 	return 0;
2660 }
2661 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2662 
2663 /**
2664  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2665  *	@sk: socket
2666  *	@size: memory size to allocate
2667  *	@kind: allocation type
2668  *
2669  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2670  *	rmem allocation. This function assumes that protocols which have
2671  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2672  */
2673 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2674 {
2675 	int ret, amt = sk_mem_pages(size);
2676 
2677 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2678 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2679 	if (!ret)
2680 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2681 	return ret;
2682 }
2683 EXPORT_SYMBOL(__sk_mem_schedule);
2684 
2685 /**
2686  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2687  *	@sk: socket
2688  *	@amount: number of quanta
2689  *
2690  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2691  */
2692 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2693 {
2694 	sk_memory_allocated_sub(sk, amount);
2695 
2696 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2697 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2698 
2699 	if (sk_under_memory_pressure(sk) &&
2700 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2701 		sk_leave_memory_pressure(sk);
2702 }
2703 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2704 
2705 /**
2706  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2707  *	@sk: socket
2708  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2709  */
2710 void __sk_mem_reclaim(struct sock *sk, int amount)
2711 {
2712 	amount >>= SK_MEM_QUANTUM_SHIFT;
2713 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2714 	__sk_mem_reduce_allocated(sk, amount);
2715 }
2716 EXPORT_SYMBOL(__sk_mem_reclaim);
2717 
/* Default implementation for setting SO_PEEK_OFF; always succeeds. */
int sk_set_peek_off(struct sock *sk, int val)
{
	sk->sk_peek_off = val;
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);
2724 
2725 /*
2726  * Set of default routines for initialising struct proto_ops when
2727  * the protocol does not support a particular function. In certain
2728  * cases where it makes no sense for a protocol to have a "do nothing"
2729  * function, some default processing is provided.
2730  */
2731 
/* Stub: bind() is not supported by the protocol. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);
2737 
/* Stub: connect() is not supported by the protocol. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);
2744 
/* Stub: socketpair() is not supported by the protocol. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);
2750 
/* Stub: accept() is not supported by the protocol. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
		   bool kern)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);
2757 
/* Stub: getsockname()/getpeername() are not supported by the protocol. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);
2764 
/* Stub: ioctl() is not supported by the protocol. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);
2770 
/* Stub: listen() is not supported by the protocol. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);
2776 
/* Stub: shutdown() is not supported by the protocol. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);
2782 
/* Stub: sendmsg() is not supported by the protocol. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);
2788 
/* Stub: locked sendmsg() is not supported by the protocol. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);
2794 
/* Stub: recvmsg() is not supported by the protocol. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);
2801 
/* Stub: mmap() is not supported by the protocol. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);
2808 
2809 /*
2810  * When a file is received (via SCM_RIGHTS, etc), we must bump the
2811  * various sock-based usage counts.
2812  */
2813 void __receive_sock(struct file *file)
2814 {
2815 	struct socket *sock;
2816 	int error;
2817 
2818 	/*
2819 	 * The resulting value of "error" is ignored here since we only
2820 	 * need to take action when the file is a socket and testing
2821 	 * "sock" for NULL is sufficient.
2822 	 */
2823 	sock = sock_from_file(file, &error);
2824 	if (sock) {
2825 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
2826 		sock_update_classid(&sock->sk->sk_cgrp_data);
2827 	}
2828 }
2829 
/* Generic sendpage fallback: kmap the page and route it through
 * kernel_sendmsg().  Note @offset/@size are assumed to stay within
 * the page, as with any sendpage implementation.
 */
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
2843 
/* As sock_no_sendpage(), but for callers already holding the socket
 * lock: uses kernel_sendmsg_locked().
 */
ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
				int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage_locked);
2859 
2860 /*
2861  *	Default Socket Callbacks
2862  */
2863 
/* Default sk_state_change callback: wake everyone sleeping on the
 * socket's wait queue.  sk_wq is RCU-protected.
 */
static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
2874 
/* Default sk_error_report callback: wake pollers with EPOLLERR and
 * deliver async notification (SIGIO/SIGURG machinery) via sk_wake_async().
 */
static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}
2886 
/* Default sk_data_ready callback: wake readers with the input-ready
 * poll mask and send async readiness notification.
 */
void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
						EPOLLRDNORM | EPOLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}
2899 
/* Default sk_write_space callback: wake writers once at least half of
 * the send buffer is free again.
 */
static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
						EPOLLWRNORM | EPOLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}
2922 
/* Default sk_destruct callback: nothing to clean up. */
static void sock_def_destruct(struct sock *sk)
{
}
2926 
/* Send SIGURG to the socket's owner and, if that succeeds, notify
 * async waiters of urgent data.
 */
void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);
2934 
/* (Re)arm a socket timer.  mod_timer() returns 0 when the timer was
 * inactive, in which case we take a socket reference that the timer
 * handler (or sk_stop_timer()) is expected to release.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);
2942 
/* Stop a socket timer; if it was pending, drop the reference that
 * sk_reset_timer() took for it.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);
2949 
/* Initialise a freshly allocated struct sock with default limits,
 * callbacks and state, and attach it to @sock (which may be NULL,
 * in which case the uid defaults to root in the socket's user ns).
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
	sk_init_common(sk);
	sk->sk_send_head	=	NULL;

	timer_setup(&sk->sk_timer, NULL, 0);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type	=	sock->type;
		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
		sock->sk	=	sk;
		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
	} else {
		RCU_INIT_POINTER(sk->sk_wq, NULL);
		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
	}

	rwlock_init(&sk->sk_callback_lock);
	/* Kernel and user sockets get distinct lockdep classes. */
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(
			&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_frag.page	=	NULL;
	sk->sk_frag.offset	=	0;
	sk->sk_peek_off		=	-1;

	sk->sk_peer_pid 	=	NULL;
	sk->sk_peer_cred	=	NULL;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id		=	0;
	sk->sk_ll_usec		=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	WRITE_ONCE(sk->sk_pacing_shift, 10);
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
3030 
/* Acquire socket ownership (the "slow" lock with mutex semantics),
 * sleeping via __lock_sock() if another task owns the socket.
 * @subclass is the lockdep nesting level.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);
3046 
/* Release socket ownership: first drain any backlog accumulated while
 * we owned the socket, then run the protocol's release callback and
 * wake tasks waiting in __lock_sock().
 */
void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
3065 
3066 /**
3067  * lock_sock_fast - fast version of lock_sock
3068  * @sk: socket
3069  *
3070  * This version should be used for very small section, where process wont block
3071  * return false if fast path is taken:
3072  *
3073  *   sk_lock.slock locked, owned = 0, BH disabled
3074  *
3075  * return true if slow path is taken:
3076  *
3077  *   sk_lock.slock unlocked, owned = 1, BH enabled
3078  */
3079 bool lock_sock_fast(struct sock *sk)
3080 {
3081 	might_sleep();
3082 	spin_lock_bh(&sk->sk_lock.slock);
3083 
3084 	if (!sk->sk_lock.owned)
3085 		/*
3086 		 * Note : We must disable BH
3087 		 */
3088 		return false;
3089 
3090 	__lock_sock(sk);
3091 	sk->sk_lock.owned = 1;
3092 	spin_unlock(&sk->sk_lock.slock);
3093 	/*
3094 	 * The sk_lock has mutex_lock() semantics here:
3095 	 */
3096 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
3097 	local_bh_enable();
3098 	return true;
3099 }
3100 EXPORT_SYMBOL(lock_sock_fast);
3101 
/* Copy the socket's last-packet timestamp to userspace in the format
 * selected by @timeval/@time32 (SIOCGSTAMP family).  Enables
 * SOCK_TIMESTAMP as a side effect so future packets are stamped.
 */
int sock_gettstamp(struct socket *sock, void __user *userstamp,
		   bool timeval, bool time32)
{
	struct sock *sk = sock->sk;
	struct timespec64 ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec64(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		/* No timestamp recorded yet: synthesize one now. */
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		ts = ktime_to_timespec64(kt);
	}

	if (timeval)
		ts.tv_nsec /= 1000;

#ifdef CONFIG_COMPAT_32BIT_TIME
	if (time32)
		return put_old_timespec32(&ts, userstamp);
#endif
#ifdef CONFIG_SPARC64
	/* beware of padding in sparc64 timeval */
	if (timeval && !in_compat_syscall()) {
		struct __kernel_old_timeval __user tv = {
			.tv_sec = ts.tv_sec,
			.tv_usec = ts.tv_nsec,
		};
		if (copy_to_user(userstamp, &tv, sizeof(tv)))
			return -EFAULT;
		return 0;
	}
#endif
	return put_timespec64(&ts, userstamp);
}
EXPORT_SYMBOL(sock_gettstamp);
3140 
/* Set a timestamping flag on the socket and, if this is the first of
 * the timestamp flags to be enabled, turn on global net timestamping.
 */
void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}
3157 
/* Dequeue one skb from the socket error queue and copy it to @msg,
 * appending the extended error as a cmsg at (@level, @type).
 * Returns the number of bytes copied, -EAGAIN when the queue is
 * empty, or a copy error.  Sets MSG_TRUNC when @len was too small.
 */
int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
3193 
3194 /*
3195  *	Get a socket option on an socket.
3196  *
3197  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3198  *	asynchronous errors should be reported by getsockopt. We assume
3199  *	this means if you specify SO_ERROR (otherwise whats the point of it).
3200  */
3201 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3202 			   char __user *optval, int __user *optlen)
3203 {
3204 	struct sock *sk = sock->sk;
3205 
3206 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3207 }
3208 EXPORT_SYMBOL(sock_common_getsockopt);
3209 
/* Generic recvmsg: delegate to the protocol handler and propagate the
 * source address length back into the msghdr on success.
 */
int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);
3224 
3225 /*
3226  *	Set socket options on an inet socket.
3227  */
3228 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3229 			   sockptr_t optval, unsigned int optlen)
3230 {
3231 	struct sock *sk = sock->sk;
3232 
3233 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3234 }
3235 EXPORT_SYMBOL(sock_common_setsockopt);
3236 
/* Common teardown path for a socket: destroy protocol state, unhash,
 * orphan, free xfrm policies and drop the final reference.
 */
void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sock_common_release is called, processes have
	 * no access to socket. But net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * In this point socket cannot receive new packets, but it is possible
	 * that some packets are in flight because some CPU runs receiver and
	 * did hash table lookup before we unhashed socket. They will achieve
	 * receive queue and will be purged by socket destructor.
	 *
	 * Also we still have packets pending on receive queue and probably,
	 * our own packets waiting in device queues. sock_destroy will drain
	 * receive queue, but transmitted packets will delay socket destruction
	 * until the last reference will be released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3273 
/* Fill @mem (SK_MEMINFO_VARS entries) with a lockless snapshot of the
 * socket's memory counters; fields are read with READ_ONCE where they
 * can change concurrently, so values may be mutually inconsistent.
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3288 
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* Per-cpu, per-netns in-use counters, one slot per registered proto. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

/* Tracks which slots in prot_inuse::val are assigned to a proto. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3296 
/* Adjust this netns' per-cpu in-use counter for @prot by @val (+1/-1). */
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3302 
/* Sum @prot's per-cpu in-use counters for @net.  Individual per-cpu
 * values are signed deltas and the racy sum can transiently go
 * negative, so clamp the result to zero.
 */
int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3314 
/* Adjust the netns-wide per-cpu total-sockets counter by @val (+1/-1). */
static void sock_inuse_add(struct net *net, int val)
{
	this_cpu_add(*net->core.sock_inuse, val);
}
3319 
3320 int sock_inuse_get(struct net *net)
3321 {
3322 	int cpu, res = 0;
3323 
3324 	for_each_possible_cpu(cpu)
3325 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3326 
3327 	return res;
3328 }
3329 
3330 EXPORT_SYMBOL_GPL(sock_inuse_get);
3331 
3332 static int __net_init sock_inuse_init_net(struct net *net)
3333 {
3334 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3335 	if (net->core.prot_inuse == NULL)
3336 		return -ENOMEM;
3337 
3338 	net->core.sock_inuse = alloc_percpu(int);
3339 	if (net->core.sock_inuse == NULL)
3340 		goto out;
3341 
3342 	return 0;
3343 
3344 out:
3345 	free_percpu(net->core.prot_inuse);
3346 	return -ENOMEM;
3347 }
3348 
/* Free the per-netns socket accounting counters on netns teardown. */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.prot_inuse);
	free_percpu(net->core.sock_inuse);
}
3354 
/* Per-netns lifecycle hooks for the socket in-use counters. */
static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};
3359 
/* Register the in-use counter pernet ops; the counters are required by
 * every protocol, so failure here is fatal at boot.
 */
static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
3369 
/* Reserve a free slot in proto_inuse_idx for @prot.
 *
 * The last index (PROTO_INUSE_NR - 1) is never handed out: landing on
 * it means every usable slot is taken, and release_proto_idx() relies
 * on that value meaning "no index was assigned".
 */
static int assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return -ENOSPC;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
	return 0;
}
3382 
/* Return @prot's bitmap slot; the sentinel index PROTO_INUSE_NR - 1
 * means assign_proto_idx() never granted one, so nothing to clear.
 */
static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
3388 #else
/* !CONFIG_PROC_FS: in-use accounting is not exposed, so these are no-ops. */
static inline int assign_proto_idx(struct proto *prot)
{
	return 0;
}

static inline void release_proto_idx(struct proto *prot)
{
}

static void sock_inuse_add(struct net *net, int val)
{
}
3401 #endif
3402 
/* Tear down the timewait slab cache and its kasprintf()'d name.
 * Safe on partially-initialized state: kfree(NULL) and
 * kmem_cache_destroy(NULL) are both no-ops.
 */
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}
3412 
/* Tear down the request_sock slab cache and its kasprintf()'d name;
 * mirror of req_prot_init().  NULL members are tolerated, so this can
 * run after a partial init failure.
 */
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}
3422 
3423 static int req_prot_init(const struct proto *prot)
3424 {
3425 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3426 
3427 	if (!rsk_prot)
3428 		return 0;
3429 
3430 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3431 					prot->name);
3432 	if (!rsk_prot->slab_name)
3433 		return -ENOMEM;
3434 
3435 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3436 					   rsk_prot->obj_size, 0,
3437 					   SLAB_ACCOUNT | prot->slab_flags,
3438 					   NULL);
3439 
3440 	if (!rsk_prot->slab) {
3441 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3442 			prot->name);
3443 		return -ENOMEM;
3444 	}
3445 	return 0;
3446 }
3447 
/**
 * proto_register - register a transport protocol with the socket core
 * @prot: protocol descriptor; must stay valid until proto_unregister()
 * @alloc_slab: when non-zero, create the sock, request_sock and
 *	timewait slab caches named after prot->name
 *
 * Returns 0 on success, -ENOBUFS/-ENOMEM if a cache cannot be created,
 * or -ENOSPC from assign_proto_idx().  On failure every cache created
 * here is torn down again before returning.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  SLAB_ACCOUNT |
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab;
		}
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

	/* Error unwinding: the timewait label falls through into the
	 * request-sock one, so each label undoes progressively more.
	 * The cleanup helpers tolerate NULL / partially-set state.
	 */
out_free_timewait_sock_slab:
	if (alloc_slab && prot->twsk_prot)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);
3511 
/**
 * proto_unregister - undo proto_register()
 * @prot: previously registered protocol descriptor
 *
 * Releases the in-use index, unlinks the protocol from proto_list and
 * destroys every slab cache proto_register() may have created.
 */
void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
3526 
/**
 * sock_load_diag_module - request the sock_diag handler module
 * @family: address family being queried
 * @protocol: 0 for the family-level diag handler, else a specific protocol
 *
 * Returns -ENOENT when the family (or, for AF_INET, the protocol) is
 * not registered so there is nothing to load; otherwise the result of
 * request_module() on the corresponding "net-pf-...-proto-...-type-..."
 * module alias.
 */
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	/* IPPROTO_RAW has no diag handler module; skip the lookup for it. */
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
3549 
3550 #ifdef CONFIG_PROC_FS
/* Begin a /proc/net/protocols walk; proto_list_mutex is held for the
 * whole sequence and released in proto_seq_stop().
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}
3557 
/* Advance to the next registered protocol in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}
3562 
/* End of a /proc/net/protocols walk; drop the lock taken in _start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}
3568 
/* Map a protocol callback pointer to the 'y'/'n' flag printed in the
 * /proc/net/protocols table.
 */
static char proto_method_implemented(const void *method)
{
	return method ? 'y' : 'n';
}
/* Total pages charged to @proto, or -1 if it does no memory accounting. */
static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}
3577 
3578 static const char *sock_prot_memory_pressure(struct proto *proto)
3579 {
3580 	return proto->memory_pressure != NULL ?
3581 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3582 }
3583 
/* Emit one /proc/net/protocols row for @proto.  The trailing 19 %2c
 * flags must line up, in order, with the "cl co di ... em" column
 * header printed by proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}
3617 
/* Show one seq_file entry: the list head sentinel produces the column
 * header, every real list node produces a protocol row.
 */
static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}
3635 
/* seq_file iterator for /proc/net/protocols. */
static const struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};
3642 
/* Create /proc/net/protocols for a new network namespace. */
static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}
3651 
/* Remove /proc/net/protocols when the network namespace goes away. */
static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}
3656 
3657 
/* Per-netns lifecycle hooks for the /proc/net/protocols file. */
static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};
3662 
/* Register the /proc/net/protocols pernet ops at boot. */
static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
3669 
3670 #endif /* PROC_FS */
3671 
3672 #ifdef CONFIG_NET_RX_BUSY_POLL
/* Busy-poll termination predicate: stop looping once the socket's
 * receive queue is non-empty or the poll budget since @start_time has
 * been exhausted.  @p is the struct sock being polled.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
	       sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
3681 #endif /* CONFIG_NET_RX_BUSY_POLL */
3682 
3683 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3684 {
3685 	if (!sk->sk_prot->bind_add)
3686 		return -EOPNOTSUPP;
3687 	return sk->sk_prot->bind_add(sk, addr, addr_len);
3688 }
3689 EXPORT_SYMBOL(sock_bind_add);
3690