xref: /linux/net/core/sock.c (revision 89721e3038d181bacbd6be54354b513fdf1b4f10)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <net/proto_memory.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 #include <net/bpf_sk_storage.h>
141 
142 #include <trace/events/sock.h>
143 
144 #include <net/tcp.h>
145 #include <net/busy_poll.h>
146 #include <net/phonet/phonet.h>
147 
148 #include <linux/ethtool.h>
149 
150 #include "dev.h"
151 
152 static DEFINE_MUTEX(proto_list_mutex);
153 static LIST_HEAD(proto_list);
154 
155 static void sock_def_write_space_wfree(struct sock *sk);
156 static void sock_def_write_space(struct sock *sk);
157 
158 /**
159  * sk_ns_capable - General socket capability test
160  * @sk: Socket to use a capability on or through
161  * @user_ns: The user namespace of the capability to use
162  * @cap: The capability to use
163  * Test to see if the opener of the socket had the capability @cap when
164  * the socket was created and if the current process has the capability
165  * @cap in the user namespace @user_ns.
166  * namespace @user_ns.
167  */
168 bool sk_ns_capable(const struct sock *sk,
169 		   struct user_namespace *user_ns, int cap)
170 {
171 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
172 		ns_capable(user_ns, cap);
173 }
174 EXPORT_SYMBOL(sk_ns_capable);
175 
176 /**
177  * sk_capable - Socket global capability test
178  * @sk: Socket to use a capability on or through
179  * @cap: The global capability to use
180  *
181  * Test to see if the opener of the socket had the capability @cap when
182  * the socket was created and if the current process has the capability
183  * @cap in all user namespaces.
184  */
185 bool sk_capable(const struct sock *sk, int cap)
186 {
187 	return sk_ns_capable(sk, &init_user_ns, cap);
188 }
189 EXPORT_SYMBOL(sk_capable);
190 
191 /**
192  * sk_net_capable - Network namespace socket capability test
193  * @sk: Socket to use a capability on or through
194  * @cap: The capability to use
195  *
196  * Test to see if the opener of the socket had the capability @cap when the
197  * socket was created and if the current process has the capability @cap over
198  * the network namespace the socket is a member of.
199  */
200 bool sk_net_capable(const struct sock *sk, int cap)
201 {
202 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
203 }
204 EXPORT_SYMBOL(sk_net_capable);
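/*
 * Usage sketch (hypothetical caller, not part of this file): a protocol
 * gating a privileged operation on a socket could write
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * which succeeds only if the socket's opener had CAP_NET_ADMIN and the
 * current task has it in the user namespace that owns the socket's
 * network namespace.
 */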
205 
206 /*
207  * Each address family might have different locking rules, so we have
208  * one slock key per address family and separate keys for internal and
209  * userspace sockets.
210  */
211 static struct lock_class_key af_family_keys[AF_MAX];
212 static struct lock_class_key af_family_kern_keys[AF_MAX];
213 static struct lock_class_key af_family_slock_keys[AF_MAX];
214 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
215 
216 /*
217  * Make lock validator output more readable. (we pre-construct these
218  * strings build-time, so that runtime initialization of socket
219  * locks is fast):
220  */
221 
222 #define _sock_locks(x)						  \
223   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
224   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
225   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
226   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
227   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
228   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
229   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
230   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
231   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
232   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
233   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
234   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
235   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
236   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
237   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
238   x "AF_MCTP"  , \
239   x "AF_MAX"
240 
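/*
 * Illustration: _sock_locks("sk_lock-") expands to the string literals
 * "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX", "sk_lock-AF_INET", ...,
 * "sk_lock-AF_MAX", which is why each af_family_*_key_strings[] array
 * below holds AF_MAX + 1 entries.
 */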
241 static const char *const af_family_key_strings[AF_MAX+1] = {
242 	_sock_locks("sk_lock-")
243 };
244 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
245 	_sock_locks("slock-")
246 };
247 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
248 	_sock_locks("clock-")
249 };
250 
251 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
252 	_sock_locks("k-sk_lock-")
253 };
254 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
255 	_sock_locks("k-slock-")
256 };
257 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
258 	_sock_locks("k-clock-")
259 };
260 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
261 	_sock_locks("rlock-")
262 };
263 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
264 	_sock_locks("wlock-")
265 };
266 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
267 	_sock_locks("elock-")
268 };
269 
270 /*
271  * sk_callback_lock and sk queues locking rules are per-address-family,
272  * so split the lock classes by using a per-AF key:
273  */
274 static struct lock_class_key af_callback_keys[AF_MAX];
275 static struct lock_class_key af_rlock_keys[AF_MAX];
276 static struct lock_class_key af_wlock_keys[AF_MAX];
277 static struct lock_class_key af_elock_keys[AF_MAX];
278 static struct lock_class_key af_kern_callback_keys[AF_MAX];
279 
280 /* Run time adjustable parameters. */
281 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
282 EXPORT_SYMBOL(sysctl_wmem_max);
283 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
284 EXPORT_SYMBOL(sysctl_rmem_max);
285 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
286 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
287 
288 int sysctl_tstamp_allow_data __read_mostly = 1;
289 
290 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
291 EXPORT_SYMBOL_GPL(memalloc_socks_key);
292 
293 /**
294  * sk_set_memalloc - sets %SOCK_MEMALLOC
295  * @sk: socket to set it on
296  *
297  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
298  * It's the responsibility of the admin to adjust min_free_kbytes
299  * to meet the requirements
300  */
301 void sk_set_memalloc(struct sock *sk)
302 {
303 	sock_set_flag(sk, SOCK_MEMALLOC);
304 	sk->sk_allocation |= __GFP_MEMALLOC;
305 	static_branch_inc(&memalloc_socks_key);
306 }
307 EXPORT_SYMBOL_GPL(sk_set_memalloc);
308 
309 void sk_clear_memalloc(struct sock *sk)
310 {
311 	sock_reset_flag(sk, SOCK_MEMALLOC);
312 	sk->sk_allocation &= ~__GFP_MEMALLOC;
313 	static_branch_dec(&memalloc_socks_key);
314 
315 	/*
316 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
317 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
318 	 * it has rmem allocations due to the last swapfile being deactivated
319 	 * but there is a risk that the socket is unusable due to exceeding
320 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
321 	 */
322 	sk_mem_reclaim(sk);
323 }
324 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
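/*
 * Usage sketch (hypothetical caller): a driver that backs swap over the
 * network would mark its transport socket with
 *
 *	sk_set_memalloc(sock->sk);
 *
 * so allocations on that socket may dip into the emergency reserves, and
 * would call sk_clear_memalloc() once the last swap file on it is
 * deactivated.
 */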
325 
326 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
327 {
328 	int ret;
329 	unsigned int noreclaim_flag;
330 
331 	/* these should have been dropped before queueing */
332 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
333 
334 	noreclaim_flag = memalloc_noreclaim_save();
335 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
336 				 tcp_v6_do_rcv,
337 				 tcp_v4_do_rcv,
338 				 sk, skb);
339 	memalloc_noreclaim_restore(noreclaim_flag);
340 
341 	return ret;
342 }
343 EXPORT_SYMBOL(__sk_backlog_rcv);
344 
345 void sk_error_report(struct sock *sk)
346 {
347 	sk->sk_error_report(sk);
348 
349 	switch (sk->sk_family) {
350 	case AF_INET:
351 		fallthrough;
352 	case AF_INET6:
353 		trace_inet_sk_error_report(sk);
354 		break;
355 	default:
356 		break;
357 	}
358 }
359 EXPORT_SYMBOL(sk_error_report);
360 
361 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
362 {
363 	struct __kernel_sock_timeval tv;
364 
365 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
366 		tv.tv_sec = 0;
367 		tv.tv_usec = 0;
368 	} else {
369 		tv.tv_sec = timeo / HZ;
370 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
371 	}
372 
373 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
374 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
375 		*(struct old_timeval32 *)optval = tv32;
376 		return sizeof(tv32);
377 	}
378 
379 	if (old_timeval) {
380 		struct __kernel_old_timeval old_tv;
381 		old_tv.tv_sec = tv.tv_sec;
382 		old_tv.tv_usec = tv.tv_usec;
383 		*(struct __kernel_old_timeval *)optval = old_tv;
384 		return sizeof(old_tv);
385 	}
386 
387 	*(struct __kernel_sock_timeval *)optval = tv;
388 	return sizeof(tv);
389 }
390 EXPORT_SYMBOL(sock_get_timeout);
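/*
 * Worked example (assuming HZ == 1000): a timeout of 2500 jiffies is
 * reported as tv_sec = 2500 / 1000 = 2 and
 * tv_usec = ((2500 % 1000) * USEC_PER_SEC) / 1000 = 500000, i.e. 2.5s.
 * MAX_SCHEDULE_TIMEOUT is reported as {0, 0}, meaning "never time out".
 */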
391 
392 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
393 			   sockptr_t optval, int optlen, bool old_timeval)
394 {
395 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
396 		struct old_timeval32 tv32;
397 
398 		if (optlen < sizeof(tv32))
399 			return -EINVAL;
400 
401 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
402 			return -EFAULT;
403 		tv->tv_sec = tv32.tv_sec;
404 		tv->tv_usec = tv32.tv_usec;
405 	} else if (old_timeval) {
406 		struct __kernel_old_timeval old_tv;
407 
408 		if (optlen < sizeof(old_tv))
409 			return -EINVAL;
410 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
411 			return -EFAULT;
412 		tv->tv_sec = old_tv.tv_sec;
413 		tv->tv_usec = old_tv.tv_usec;
414 	} else {
415 		if (optlen < sizeof(*tv))
416 			return -EINVAL;
417 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
418 			return -EFAULT;
419 	}
420 
421 	return 0;
422 }
423 EXPORT_SYMBOL(sock_copy_user_timeval);
424 
425 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
426 			    bool old_timeval)
427 {
428 	struct __kernel_sock_timeval tv;
429 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
430 	long val;
431 
432 	if (err)
433 		return err;
434 
435 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
436 		return -EDOM;
437 
438 	if (tv.tv_sec < 0) {
439 		static int warned __read_mostly;
440 
441 		WRITE_ONCE(*timeo_p, 0);
442 		if (warned < 10 && net_ratelimit()) {
443 			warned++;
444 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
445 				__func__, current->comm, task_pid_nr(current));
446 		}
447 		return 0;
448 	}
449 	val = MAX_SCHEDULE_TIMEOUT;
450 	if ((tv.tv_sec || tv.tv_usec) &&
451 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
452 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
453 						    USEC_PER_SEC / HZ);
454 	WRITE_ONCE(*timeo_p, val);
455 	return 0;
456 }
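/*
 * Worked example for the reverse direction (assuming HZ == 1000): a user
 * supplied {tv_sec = 2, tv_usec = 500000} becomes
 * 2 * 1000 + DIV_ROUND_UP(500000, USEC_PER_SEC / 1000) = 2500 jiffies,
 * while {0, 0} is left as MAX_SCHEDULE_TIMEOUT (wait forever).
 */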
457 
458 static bool sock_needs_netstamp(const struct sock *sk)
459 {
460 	switch (sk->sk_family) {
461 	case AF_UNSPEC:
462 	case AF_UNIX:
463 		return false;
464 	default:
465 		return true;
466 	}
467 }
468 
469 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
470 {
471 	if (sk->sk_flags & flags) {
472 		sk->sk_flags &= ~flags;
473 		if (sock_needs_netstamp(sk) &&
474 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
475 			net_disable_timestamp();
476 	}
477 }
478 
479 
480 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
481 {
482 	unsigned long flags;
483 	struct sk_buff_head *list = &sk->sk_receive_queue;
484 
485 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
486 		atomic_inc(&sk->sk_drops);
487 		trace_sock_rcvqueue_full(sk, skb);
488 		return -ENOMEM;
489 	}
490 
491 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
492 		atomic_inc(&sk->sk_drops);
493 		return -ENOBUFS;
494 	}
495 
496 	skb->dev = NULL;
497 	skb_set_owner_r(skb, sk);
498 
499 	/* We escape from the RCU protected region, make sure we don't leak
500 	 * a non-refcounted dst.
501 	 */
502 	skb_dst_force(skb);
503 
504 	spin_lock_irqsave(&list->lock, flags);
505 	sock_skb_set_dropcount(sk, skb);
506 	__skb_queue_tail(list, skb);
507 	spin_unlock_irqrestore(&list->lock, flags);
508 
509 	if (!sock_flag(sk, SOCK_DEAD))
510 		sk->sk_data_ready(sk);
511 	return 0;
512 }
513 EXPORT_SYMBOL(__sock_queue_rcv_skb);
514 
515 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
516 			      enum skb_drop_reason *reason)
517 {
518 	enum skb_drop_reason drop_reason;
519 	int err;
520 
521 	err = sk_filter(sk, skb);
522 	if (err) {
523 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
524 		goto out;
525 	}
526 	err = __sock_queue_rcv_skb(sk, skb);
527 	switch (err) {
528 	case -ENOMEM:
529 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
530 		break;
531 	case -ENOBUFS:
532 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
533 		break;
534 	default:
535 		drop_reason = SKB_NOT_DROPPED_YET;
536 		break;
537 	}
538 out:
539 	if (reason)
540 		*reason = drop_reason;
541 	return err;
542 }
543 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
544 
545 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
546 		     const int nested, unsigned int trim_cap, bool refcounted)
547 {
548 	int rc = NET_RX_SUCCESS;
549 
550 	if (sk_filter_trim_cap(sk, skb, trim_cap))
551 		goto discard_and_relse;
552 
553 	skb->dev = NULL;
554 
555 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
556 		atomic_inc(&sk->sk_drops);
557 		goto discard_and_relse;
558 	}
559 	if (nested)
560 		bh_lock_sock_nested(sk);
561 	else
562 		bh_lock_sock(sk);
563 	if (!sock_owned_by_user(sk)) {
564 		/*
565 		 * trylock + unlock semantics:
566 		 */
567 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
568 
569 		rc = sk_backlog_rcv(sk, skb);
570 
571 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
572 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
573 		bh_unlock_sock(sk);
574 		atomic_inc(&sk->sk_drops);
575 		goto discard_and_relse;
576 	}
577 
578 	bh_unlock_sock(sk);
579 out:
580 	if (refcounted)
581 		sock_put(sk);
582 	return rc;
583 discard_and_relse:
584 	kfree_skb(skb);
585 	goto out;
586 }
587 EXPORT_SYMBOL(__sk_receive_skb);
588 
589 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
590 							  u32));
591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
592 							   u32));
593 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
594 {
595 	struct dst_entry *dst = __sk_dst_get(sk);
596 
597 	if (dst && dst->obsolete &&
598 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
599 			       dst, cookie) == NULL) {
600 		sk_tx_queue_clear(sk);
601 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
602 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
603 		dst_release(dst);
604 		return NULL;
605 	}
606 
607 	return dst;
608 }
609 EXPORT_SYMBOL(__sk_dst_check);
610 
611 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
612 {
613 	struct dst_entry *dst = sk_dst_get(sk);
614 
615 	if (dst && dst->obsolete &&
616 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
617 			       dst, cookie) == NULL) {
618 		sk_dst_reset(sk);
619 		dst_release(dst);
620 		return NULL;
621 	}
622 
623 	return dst;
624 }
625 EXPORT_SYMBOL(sk_dst_check);
626 
627 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
628 {
629 	int ret = -ENOPROTOOPT;
630 #ifdef CONFIG_NETDEVICES
631 	struct net *net = sock_net(sk);
632 
633 	/* Sorry... */
634 	ret = -EPERM;
635 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
636 		goto out;
637 
638 	ret = -EINVAL;
639 	if (ifindex < 0)
640 		goto out;
641 
642 	/* Paired with all READ_ONCE() done locklessly. */
643 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
644 
645 	if (sk->sk_prot->rehash)
646 		sk->sk_prot->rehash(sk);
647 	sk_dst_reset(sk);
648 
649 	ret = 0;
650 
651 out:
652 #endif
653 
654 	return ret;
655 }
656 
657 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
658 {
659 	int ret;
660 
661 	if (lock_sk)
662 		lock_sock(sk);
663 	ret = sock_bindtoindex_locked(sk, ifindex);
664 	if (lock_sk)
665 		release_sock(sk);
666 
667 	return ret;
668 }
669 EXPORT_SYMBOL(sock_bindtoindex);
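/*
 * Usage sketch (hypothetical in-kernel caller): code that wants the
 * SO_BINDTOIFINDEX behaviour without going through setsockopt() can call
 *
 *	err = sock_bindtoindex(sk, dev->ifindex, true);
 *
 * passing lock_sk == true when it does not already hold the socket lock.
 */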
670 
671 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
672 {
673 	int ret = -ENOPROTOOPT;
674 #ifdef CONFIG_NETDEVICES
675 	struct net *net = sock_net(sk);
676 	char devname[IFNAMSIZ];
677 	int index;
678 
679 	ret = -EINVAL;
680 	if (optlen < 0)
681 		goto out;
682 
683 	/* Bind this socket to a particular device like "eth0",
684 	 * as specified in the passed interface name. If the
685 	 * name is "" or the option length is zero the socket
686 	 * is not bound.
687 	 */
688 	if (optlen > IFNAMSIZ - 1)
689 		optlen = IFNAMSIZ - 1;
690 	memset(devname, 0, sizeof(devname));
691 
692 	ret = -EFAULT;
693 	if (copy_from_sockptr(devname, optval, optlen))
694 		goto out;
695 
696 	index = 0;
697 	if (devname[0] != '\0') {
698 		struct net_device *dev;
699 
700 		rcu_read_lock();
701 		dev = dev_get_by_name_rcu(net, devname);
702 		if (dev)
703 			index = dev->ifindex;
704 		rcu_read_unlock();
705 		ret = -ENODEV;
706 		if (!dev)
707 			goto out;
708 	}
709 
710 	sockopt_lock_sock(sk);
711 	ret = sock_bindtoindex_locked(sk, index);
712 	sockopt_release_sock(sk);
713 out:
714 #endif
715 
716 	return ret;
717 }
718 
719 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
720 				sockptr_t optlen, int len)
721 {
722 	int ret = -ENOPROTOOPT;
723 #ifdef CONFIG_NETDEVICES
724 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
725 	struct net *net = sock_net(sk);
726 	char devname[IFNAMSIZ];
727 
728 	if (bound_dev_if == 0) {
729 		len = 0;
730 		goto zero;
731 	}
732 
733 	ret = -EINVAL;
734 	if (len < IFNAMSIZ)
735 		goto out;
736 
737 	ret = netdev_get_name(net, devname, bound_dev_if);
738 	if (ret)
739 		goto out;
740 
741 	len = strlen(devname) + 1;
742 
743 	ret = -EFAULT;
744 	if (copy_to_sockptr(optval, devname, len))
745 		goto out;
746 
747 zero:
748 	ret = -EFAULT;
749 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
750 		goto out;
751 
752 	ret = 0;
753 
754 out:
755 #endif
756 
757 	return ret;
758 }
759 
760 bool sk_mc_loop(const struct sock *sk)
761 {
762 	if (dev_recursion_level())
763 		return false;
764 	if (!sk)
765 		return true;
766 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
767 	switch (READ_ONCE(sk->sk_family)) {
768 	case AF_INET:
769 		return inet_test_bit(MC_LOOP, sk);
770 #if IS_ENABLED(CONFIG_IPV6)
771 	case AF_INET6:
772 		return inet6_test_bit(MC6_LOOP, sk);
773 #endif
774 	}
775 	WARN_ON_ONCE(1);
776 	return true;
777 }
778 EXPORT_SYMBOL(sk_mc_loop);
779 
780 void sock_set_reuseaddr(struct sock *sk)
781 {
782 	lock_sock(sk);
783 	sk->sk_reuse = SK_CAN_REUSE;
784 	release_sock(sk);
785 }
786 EXPORT_SYMBOL(sock_set_reuseaddr);
787 
788 void sock_set_reuseport(struct sock *sk)
789 {
790 	lock_sock(sk);
791 	sk->sk_reuseport = true;
792 	release_sock(sk);
793 }
794 EXPORT_SYMBOL(sock_set_reuseport);
795 
796 void sock_no_linger(struct sock *sk)
797 {
798 	lock_sock(sk);
799 	WRITE_ONCE(sk->sk_lingertime, 0);
800 	sock_set_flag(sk, SOCK_LINGER);
801 	release_sock(sk);
802 }
803 EXPORT_SYMBOL(sock_no_linger);
804 
805 void sock_set_priority(struct sock *sk, u32 priority)
806 {
807 	WRITE_ONCE(sk->sk_priority, priority);
808 }
809 EXPORT_SYMBOL(sock_set_priority);
810 
811 void sock_set_sndtimeo(struct sock *sk, s64 secs)
812 {
813 	lock_sock(sk);
814 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
815 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
816 	else
817 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
818 	release_sock(sk);
819 }
820 EXPORT_SYMBOL(sock_set_sndtimeo);
821 
822 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
823 {
824 	if (val)  {
825 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
826 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
827 		sock_set_flag(sk, SOCK_RCVTSTAMP);
828 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
829 	} else {
830 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
831 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
832 	}
833 }
834 
835 void sock_enable_timestamps(struct sock *sk)
836 {
837 	lock_sock(sk);
838 	__sock_set_timestamps(sk, true, false, true);
839 	release_sock(sk);
840 }
841 EXPORT_SYMBOL(sock_enable_timestamps);
842 
843 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
844 {
845 	switch (optname) {
846 	case SO_TIMESTAMP_OLD:
847 		__sock_set_timestamps(sk, valbool, false, false);
848 		break;
849 	case SO_TIMESTAMP_NEW:
850 		__sock_set_timestamps(sk, valbool, true, false);
851 		break;
852 	case SO_TIMESTAMPNS_OLD:
853 		__sock_set_timestamps(sk, valbool, false, true);
854 		break;
855 	case SO_TIMESTAMPNS_NEW:
856 		__sock_set_timestamps(sk, valbool, true, true);
857 		break;
858 	}
859 }
860 
861 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
862 {
863 	struct net *net = sock_net(sk);
864 	struct net_device *dev = NULL;
865 	bool match = false;
866 	int *vclock_index;
867 	int i, num;
868 
869 	if (sk->sk_bound_dev_if)
870 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
871 
872 	if (!dev) {
873 		pr_err("%s: sock not bind to device\n", __func__);
874 		return -EOPNOTSUPP;
875 	}
876 
877 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
878 	dev_put(dev);
879 
880 	for (i = 0; i < num; i++) {
881 		if (*(vclock_index + i) == phc_index) {
882 			match = true;
883 			break;
884 		}
885 	}
886 
887 	if (num > 0)
888 		kfree(vclock_index);
889 
890 	if (!match)
891 		return -EINVAL;
892 
893 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
894 
895 	return 0;
896 }
897 
898 int sock_set_timestamping(struct sock *sk, int optname,
899 			  struct so_timestamping timestamping)
900 {
901 	int val = timestamping.flags;
902 	int ret;
903 
904 	if (val & ~SOF_TIMESTAMPING_MASK)
905 		return -EINVAL;
906 
907 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
908 	    !(val & SOF_TIMESTAMPING_OPT_ID))
909 		return -EINVAL;
910 
911 	if (val & SOF_TIMESTAMPING_OPT_ID &&
912 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
913 		if (sk_is_tcp(sk)) {
914 			if ((1 << sk->sk_state) &
915 			    (TCPF_CLOSE | TCPF_LISTEN))
916 				return -EINVAL;
917 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
918 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
919 			else
920 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
921 		} else {
922 			atomic_set(&sk->sk_tskey, 0);
923 		}
924 	}
925 
926 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
927 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
928 		return -EINVAL;
929 
930 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
931 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
932 		if (ret)
933 			return ret;
934 	}
935 
936 	WRITE_ONCE(sk->sk_tsflags, val);
937 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
938 
939 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
940 		sock_enable_timestamp(sk,
941 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
942 	else
943 		sock_disable_timestamp(sk,
944 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
945 	return 0;
946 }
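/*
 * Usage sketch (hypothetical userspace caller) for the path above:
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE |
 *			 SOF_TIMESTAMPING_OPT_ID,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 *
 * The OPT_ID flag makes this function seed sk->sk_tskey so that later
 * transmit completions can be matched to the packets that caused them.
 */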
947 
948 void sock_set_keepalive(struct sock *sk)
949 {
950 	lock_sock(sk);
951 	if (sk->sk_prot->keepalive)
952 		sk->sk_prot->keepalive(sk, true);
953 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
954 	release_sock(sk);
955 }
956 EXPORT_SYMBOL(sock_set_keepalive);
957 
958 static void __sock_set_rcvbuf(struct sock *sk, int val)
959 {
960 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
961 	 * as a negative value.
962 	 */
963 	val = min_t(int, val, INT_MAX / 2);
964 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
965 
966 	/* We double it on the way in to account for "struct sk_buff" etc.
967 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
968 	 * will allow that much actual data to be received on that socket.
969 	 *
970 	 * Applications are unaware that "struct sk_buff" and other overheads
971 	 * allocate from the receive buffer during socket buffer allocation.
972 	 *
973 	 * And after considering the possible alternatives, returning the value
974 	 * we actually used in getsockopt is the most desirable behavior.
975 	 */
976 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
977 }
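/*
 * Worked example (hypothetical values): userspace calling
 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(int){ 65536 }, sizeof(int))
 * arrives here with val = 65536 (assuming sysctl_rmem_max allows it),
 * sk_rcvbuf is set to 131072, and getsockopt(SO_RCVBUF) reports 131072.
 */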
978 
979 void sock_set_rcvbuf(struct sock *sk, int val)
980 {
981 	lock_sock(sk);
982 	__sock_set_rcvbuf(sk, val);
983 	release_sock(sk);
984 }
985 EXPORT_SYMBOL(sock_set_rcvbuf);
986 
987 static void __sock_set_mark(struct sock *sk, u32 val)
988 {
989 	if (val != sk->sk_mark) {
990 		WRITE_ONCE(sk->sk_mark, val);
991 		sk_dst_reset(sk);
992 	}
993 }
994 
995 void sock_set_mark(struct sock *sk, u32 val)
996 {
997 	lock_sock(sk);
998 	__sock_set_mark(sk, val);
999 	release_sock(sk);
1000 }
1001 EXPORT_SYMBOL(sock_set_mark);
1002 
1003 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1004 {
1005 	/* Round down bytes to multiple of pages */
1006 	bytes = round_down(bytes, PAGE_SIZE);
1007 
1008 	WARN_ON(bytes > sk->sk_reserved_mem);
1009 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1010 	sk_mem_reclaim(sk);
1011 }
1012 
1013 static int sock_reserve_memory(struct sock *sk, int bytes)
1014 {
1015 	long allocated;
1016 	bool charged;
1017 	int pages;
1018 
1019 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1020 		return -EOPNOTSUPP;
1021 
1022 	if (!bytes)
1023 		return 0;
1024 
1025 	pages = sk_mem_pages(bytes);
1026 
1027 	/* pre-charge to memcg */
1028 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1029 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1030 	if (!charged)
1031 		return -ENOMEM;
1032 
1033 	/* pre-charge to forward_alloc */
1034 	sk_memory_allocated_add(sk, pages);
1035 	allocated = sk_memory_allocated(sk);
1036 	/* If the system goes into memory pressure with this
1037 	 * precharge, give up and return error.
1038 	 */
1039 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1040 		sk_memory_allocated_sub(sk, pages);
1041 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1042 		return -ENOMEM;
1043 	}
1044 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1045 
1046 	WRITE_ONCE(sk->sk_reserved_mem,
1047 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1048 
1049 	return 0;
1050 }
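/*
 * Worked example (assuming 4 KiB pages): a request to reserve 100000 bytes
 * rounds up to sk_mem_pages(100000) = 25 pages, so sk_reserved_mem becomes
 * 102400; releasing rounds the byte count down to whole pages before
 * subtracting. See the SO_RESERVE_MEM case in sk_setsockopt() below.
 */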
1051 
1052 void sockopt_lock_sock(struct sock *sk)
1053 {
1054 	/* When current->bpf_ctx is set, the setsockopt is called from
1055 	 * a bpf prog.  bpf has ensured the sk lock has been
1056 	 * acquired before calling setsockopt().
1057 	 */
1058 	if (has_current_bpf_ctx())
1059 		return;
1060 
1061 	lock_sock(sk);
1062 }
1063 EXPORT_SYMBOL(sockopt_lock_sock);
1064 
1065 void sockopt_release_sock(struct sock *sk)
1066 {
1067 	if (has_current_bpf_ctx())
1068 		return;
1069 
1070 	release_sock(sk);
1071 }
1072 EXPORT_SYMBOL(sockopt_release_sock);
1073 
1074 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1075 {
1076 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1077 }
1078 EXPORT_SYMBOL(sockopt_ns_capable);
1079 
1080 bool sockopt_capable(int cap)
1081 {
1082 	return has_current_bpf_ctx() || capable(cap);
1083 }
1084 EXPORT_SYMBOL(sockopt_capable);
1085 
1086 /*
1087  *	This is meant for all protocols to use and covers goings on
1088  *	at the socket level. Everything here is generic.
1089  */
1090 
1091 int sk_setsockopt(struct sock *sk, int level, int optname,
1092 		  sockptr_t optval, unsigned int optlen)
1093 {
1094 	struct so_timestamping timestamping;
1095 	struct socket *sock = sk->sk_socket;
1096 	struct sock_txtime sk_txtime;
1097 	int val;
1098 	int valbool;
1099 	struct linger ling;
1100 	int ret = 0;
1101 
1102 	/*
1103 	 *	Options without arguments
1104 	 */
1105 
1106 	if (optname == SO_BINDTODEVICE)
1107 		return sock_setbindtodevice(sk, optval, optlen);
1108 
1109 	if (optlen < sizeof(int))
1110 		return -EINVAL;
1111 
1112 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1113 		return -EFAULT;
1114 
1115 	valbool = val ? 1 : 0;
1116 
1117 	/* handle options which do not require locking the socket. */
1118 	switch (optname) {
1119 	case SO_PRIORITY:
1120 		if ((val >= 0 && val <= 6) ||
1121 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1122 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1123 			sock_set_priority(sk, val);
1124 			return 0;
1125 		}
1126 		return -EPERM;
1127 	case SO_PASSSEC:
1128 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1129 		return 0;
1130 	case SO_PASSCRED:
1131 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1132 		return 0;
1133 	case SO_PASSPIDFD:
1134 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1135 		return 0;
1136 	case SO_TYPE:
1137 	case SO_PROTOCOL:
1138 	case SO_DOMAIN:
1139 	case SO_ERROR:
1140 		return -ENOPROTOOPT;
1141 #ifdef CONFIG_NET_RX_BUSY_POLL
1142 	case SO_BUSY_POLL:
1143 		if (val < 0)
1144 			return -EINVAL;
1145 		WRITE_ONCE(sk->sk_ll_usec, val);
1146 		return 0;
1147 	case SO_PREFER_BUSY_POLL:
1148 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1149 			return -EPERM;
1150 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1151 		return 0;
1152 	case SO_BUSY_POLL_BUDGET:
1153 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1154 		    !sockopt_capable(CAP_NET_ADMIN))
1155 			return -EPERM;
1156 		if (val < 0 || val > U16_MAX)
1157 			return -EINVAL;
1158 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1159 		return 0;
1160 #endif
1161 	case SO_MAX_PACING_RATE:
1162 		{
1163 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1164 		unsigned long pacing_rate;
1165 
1166 		if (sizeof(ulval) != sizeof(val) &&
1167 		    optlen >= sizeof(ulval) &&
1168 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1169 			return -EFAULT;
1170 		}
1171 		if (ulval != ~0UL)
1172 			cmpxchg(&sk->sk_pacing_status,
1173 				SK_PACING_NONE,
1174 				SK_PACING_NEEDED);
1175 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1176 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1177 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1178 		if (ulval < pacing_rate)
1179 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1180 		return 0;
1181 		}
1182 	case SO_TXREHASH:
1183 		if (val < -1 || val > 1)
1184 			return -EINVAL;
1185 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1186 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1187 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1188 		 * and sk_getsockopt().
1189 		 */
1190 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1191 		return 0;
1192 	case SO_PEEK_OFF:
1193 		{
1194 		int (*set_peek_off)(struct sock *sk, int val);
1195 
1196 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1197 		if (set_peek_off)
1198 			ret = set_peek_off(sk, val);
1199 		else
1200 			ret = -EOPNOTSUPP;
1201 		return ret;
1202 		}
1203 	}
1204 
1205 	sockopt_lock_sock(sk);
1206 
1207 	switch (optname) {
1208 	case SO_DEBUG:
1209 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1210 			ret = -EACCES;
1211 		else
1212 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1213 		break;
1214 	case SO_REUSEADDR:
1215 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1216 		break;
1217 	case SO_REUSEPORT:
1218 		sk->sk_reuseport = valbool;
1219 		break;
1220 	case SO_DONTROUTE:
1221 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1222 		sk_dst_reset(sk);
1223 		break;
1224 	case SO_BROADCAST:
1225 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1226 		break;
1227 	case SO_SNDBUF:
1228 		/* Don't error on this; BSD doesn't, and if you think
1229 		 * about it this is right. Otherwise apps have to
1230 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1231 		 * are treated in BSD as hints.
1232 		 */
1233 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1234 set_sndbuf:
1235 		/* Ensure val * 2 fits into an int, to prevent max_t()
1236 		 * from treating it as a negative value.
1237 		 */
1238 		val = min_t(int, val, INT_MAX / 2);
1239 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1240 		WRITE_ONCE(sk->sk_sndbuf,
1241 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1242 		/* Wake up sending tasks if we upped the value. */
1243 		sk->sk_write_space(sk);
1244 		break;
1245 
1246 	case SO_SNDBUFFORCE:
1247 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1248 			ret = -EPERM;
1249 			break;
1250 		}
1251 
1252 		/* No negative values (to prevent underflow, as val will be
1253 		 * multiplied by 2).
1254 		 */
1255 		if (val < 0)
1256 			val = 0;
1257 		goto set_sndbuf;
1258 
1259 	case SO_RCVBUF:
1260 		/* Don't error on this; BSD doesn't, and if you think
1261 		 * about it this is right. Otherwise apps have to
1262 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1263 		 * are treated in BSD as hints.
1264 		 */
1265 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1266 		break;
1267 
1268 	case SO_RCVBUFFORCE:
1269 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1270 			ret = -EPERM;
1271 			break;
1272 		}
1273 
1274 		/* No negative values (to prevent underflow, as val will be
1275 		 * multiplied by 2).
1276 		 */
1277 		__sock_set_rcvbuf(sk, max(val, 0));
1278 		break;
1279 
1280 	case SO_KEEPALIVE:
1281 		if (sk->sk_prot->keepalive)
1282 			sk->sk_prot->keepalive(sk, valbool);
1283 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1284 		break;
1285 
1286 	case SO_OOBINLINE:
1287 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1288 		break;
1289 
1290 	case SO_NO_CHECK:
1291 		sk->sk_no_check_tx = valbool;
1292 		break;
1293 
1294 	case SO_LINGER:
1295 		if (optlen < sizeof(ling)) {
1296 			ret = -EINVAL;	/* 1003.1g */
1297 			break;
1298 		}
1299 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1300 			ret = -EFAULT;
1301 			break;
1302 		}
1303 		if (!ling.l_onoff) {
1304 			sock_reset_flag(sk, SOCK_LINGER);
1305 		} else {
1306 			unsigned long t_sec = ling.l_linger;
1307 
1308 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1309 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1310 			else
1311 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1312 			sock_set_flag(sk, SOCK_LINGER);
1313 		}
1314 		break;
1315 
1316 	case SO_BSDCOMPAT:
1317 		break;
1318 
1319 	case SO_TIMESTAMP_OLD:
1320 	case SO_TIMESTAMP_NEW:
1321 	case SO_TIMESTAMPNS_OLD:
1322 	case SO_TIMESTAMPNS_NEW:
1323 		sock_set_timestamp(sk, optname, valbool);
1324 		break;
1325 
1326 	case SO_TIMESTAMPING_NEW:
1327 	case SO_TIMESTAMPING_OLD:
1328 		if (optlen == sizeof(timestamping)) {
1329 			if (copy_from_sockptr(&timestamping, optval,
1330 					      sizeof(timestamping))) {
1331 				ret = -EFAULT;
1332 				break;
1333 			}
1334 		} else {
1335 			memset(&timestamping, 0, sizeof(timestamping));
1336 			timestamping.flags = val;
1337 		}
1338 		ret = sock_set_timestamping(sk, optname, timestamping);
1339 		break;
1340 
1341 	case SO_RCVLOWAT:
1342 		{
1343 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1344 
1345 		if (val < 0)
1346 			val = INT_MAX;
1347 		if (sock)
1348 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1349 		if (set_rcvlowat)
1350 			ret = set_rcvlowat(sk, val);
1351 		else
1352 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1353 		break;
1354 		}
1355 	case SO_RCVTIMEO_OLD:
1356 	case SO_RCVTIMEO_NEW:
1357 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1358 				       optlen, optname == SO_RCVTIMEO_OLD);
1359 		break;
1360 
1361 	case SO_SNDTIMEO_OLD:
1362 	case SO_SNDTIMEO_NEW:
1363 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1364 				       optlen, optname == SO_SNDTIMEO_OLD);
1365 		break;
1366 
1367 	case SO_ATTACH_FILTER: {
1368 		struct sock_fprog fprog;
1369 
1370 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1371 		if (!ret)
1372 			ret = sk_attach_filter(&fprog, sk);
1373 		break;
1374 	}
1375 	case SO_ATTACH_BPF:
1376 		ret = -EINVAL;
1377 		if (optlen == sizeof(u32)) {
1378 			u32 ufd;
1379 
1380 			ret = -EFAULT;
1381 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1382 				break;
1383 
1384 			ret = sk_attach_bpf(ufd, sk);
1385 		}
1386 		break;
1387 
1388 	case SO_ATTACH_REUSEPORT_CBPF: {
1389 		struct sock_fprog fprog;
1390 
1391 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1392 		if (!ret)
1393 			ret = sk_reuseport_attach_filter(&fprog, sk);
1394 		break;
1395 	}
1396 	case SO_ATTACH_REUSEPORT_EBPF:
1397 		ret = -EINVAL;
1398 		if (optlen == sizeof(u32)) {
1399 			u32 ufd;
1400 
1401 			ret = -EFAULT;
1402 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1403 				break;
1404 
1405 			ret = sk_reuseport_attach_bpf(ufd, sk);
1406 		}
1407 		break;
1408 
1409 	case SO_DETACH_REUSEPORT_BPF:
1410 		ret = reuseport_detach_prog(sk);
1411 		break;
1412 
1413 	case SO_DETACH_FILTER:
1414 		ret = sk_detach_filter(sk);
1415 		break;
1416 
1417 	case SO_LOCK_FILTER:
1418 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1419 			ret = -EPERM;
1420 		else
1421 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1422 		break;
1423 
1424 	case SO_MARK:
1425 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1426 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1427 			ret = -EPERM;
1428 			break;
1429 		}
1430 
1431 		__sock_set_mark(sk, val);
1432 		break;
1433 	case SO_RCVMARK:
1434 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1435 		break;
1436 
1437 	case SO_RXQ_OVFL:
1438 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1439 		break;
1440 
1441 	case SO_WIFI_STATUS:
1442 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1443 		break;
1444 
1445 	case SO_NOFCS:
1446 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1447 		break;
1448 
1449 	case SO_SELECT_ERR_QUEUE:
1450 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1451 		break;
1452 
1453 
1454 	case SO_INCOMING_CPU:
1455 		reuseport_update_incoming_cpu(sk, val);
1456 		break;
1457 
1458 	case SO_CNX_ADVICE:
1459 		if (val == 1)
1460 			dst_negative_advice(sk);
1461 		break;
1462 
1463 	case SO_ZEROCOPY:
1464 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1465 			if (!(sk_is_tcp(sk) ||
1466 			      (sk->sk_type == SOCK_DGRAM &&
1467 			       sk->sk_protocol == IPPROTO_UDP)))
1468 				ret = -EOPNOTSUPP;
1469 		} else if (sk->sk_family != PF_RDS) {
1470 			ret = -EOPNOTSUPP;
1471 		}
1472 		if (!ret) {
1473 			if (val < 0 || val > 1)
1474 				ret = -EINVAL;
1475 			else
1476 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1477 		}
1478 		break;
1479 
1480 	case SO_TXTIME:
1481 		if (optlen != sizeof(struct sock_txtime)) {
1482 			ret = -EINVAL;
1483 			break;
1484 		} else if (copy_from_sockptr(&sk_txtime, optval,
1485 			   sizeof(struct sock_txtime))) {
1486 			ret = -EFAULT;
1487 			break;
1488 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1489 			ret = -EINVAL;
1490 			break;
1491 		}
1492 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1493 		 * scheduler has enough safeguards.
1494 		 */
1495 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1496 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1497 			ret = -EPERM;
1498 			break;
1499 		}
1500 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1501 		sk->sk_clockid = sk_txtime.clockid;
1502 		sk->sk_txtime_deadline_mode =
1503 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1504 		sk->sk_txtime_report_errors =
1505 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1506 		break;
1507 
1508 	case SO_BINDTOIFINDEX:
1509 		ret = sock_bindtoindex_locked(sk, val);
1510 		break;
1511 
1512 	case SO_BUF_LOCK:
1513 		if (val & ~SOCK_BUF_LOCK_MASK) {
1514 			ret = -EINVAL;
1515 			break;
1516 		}
1517 		sk->sk_userlocks = val | (sk->sk_userlocks &
1518 					  ~SOCK_BUF_LOCK_MASK);
1519 		break;
1520 
1521 	case SO_RESERVE_MEM:
1522 	{
1523 		int delta;
1524 
1525 		if (val < 0) {
1526 			ret = -EINVAL;
1527 			break;
1528 		}
1529 
1530 		delta = val - sk->sk_reserved_mem;
1531 		if (delta < 0)
1532 			sock_release_reserved_memory(sk, -delta);
1533 		else
1534 			ret = sock_reserve_memory(sk, delta);
1535 		break;
1536 	}
1537 
1538 	default:
1539 		ret = -ENOPROTOOPT;
1540 		break;
1541 	}
1542 	sockopt_release_sock(sk);
1543 	return ret;
1544 }
1545 
1546 int sock_setsockopt(struct socket *sock, int level, int optname,
1547 		    sockptr_t optval, unsigned int optlen)
1548 {
1549 	return sk_setsockopt(sock->sk, level, optname,
1550 			     optval, optlen);
1551 }
1552 EXPORT_SYMBOL(sock_setsockopt);
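/*
 * Usage sketch (hypothetical userspace program) exercising two of the
 * options handled above:
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one));
 *
 *	struct linger lg = { .l_onoff = 1, .l_linger = 5 };
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
 *
 * Both land in sk_setsockopt(); the second stores 5 * HZ in
 * sk->sk_lingertime and sets SOCK_LINGER.
 */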
1553 
1554 static const struct cred *sk_get_peer_cred(struct sock *sk)
1555 {
1556 	const struct cred *cred;
1557 
1558 	spin_lock(&sk->sk_peer_lock);
1559 	cred = get_cred(sk->sk_peer_cred);
1560 	spin_unlock(&sk->sk_peer_lock);
1561 
1562 	return cred;
1563 }
1564 
1565 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1566 			  struct ucred *ucred)
1567 {
1568 	ucred->pid = pid_vnr(pid);
1569 	ucred->uid = ucred->gid = -1;
1570 	if (cred) {
1571 		struct user_namespace *current_ns = current_user_ns();
1572 
1573 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1574 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1575 	}
1576 }
1577 
1578 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1579 {
1580 	struct user_namespace *user_ns = current_user_ns();
1581 	int i;
1582 
1583 	for (i = 0; i < src->ngroups; i++) {
1584 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1585 
1586 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1587 			return -EFAULT;
1588 	}
1589 
1590 	return 0;
1591 }
1592 
1593 int sk_getsockopt(struct sock *sk, int level, int optname,
1594 		  sockptr_t optval, sockptr_t optlen)
1595 {
1596 	struct socket *sock = sk->sk_socket;
1597 
1598 	union {
1599 		int val;
1600 		u64 val64;
1601 		unsigned long ulval;
1602 		struct linger ling;
1603 		struct old_timeval32 tm32;
1604 		struct __kernel_old_timeval tm;
1605 		struct  __kernel_sock_timeval stm;
1606 		struct sock_txtime txtime;
1607 		struct so_timestamping timestamping;
1608 	} v;
1609 
1610 	int lv = sizeof(int);
1611 	int len;
1612 
1613 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1614 		return -EFAULT;
1615 	if (len < 0)
1616 		return -EINVAL;
1617 
1618 	memset(&v, 0, sizeof(v));
1619 
1620 	switch (optname) {
1621 	case SO_DEBUG:
1622 		v.val = sock_flag(sk, SOCK_DBG);
1623 		break;
1624 
1625 	case SO_DONTROUTE:
1626 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1627 		break;
1628 
1629 	case SO_BROADCAST:
1630 		v.val = sock_flag(sk, SOCK_BROADCAST);
1631 		break;
1632 
1633 	case SO_SNDBUF:
1634 		v.val = READ_ONCE(sk->sk_sndbuf);
1635 		break;
1636 
1637 	case SO_RCVBUF:
1638 		v.val = READ_ONCE(sk->sk_rcvbuf);
1639 		break;
1640 
1641 	case SO_REUSEADDR:
1642 		v.val = sk->sk_reuse;
1643 		break;
1644 
1645 	case SO_REUSEPORT:
1646 		v.val = sk->sk_reuseport;
1647 		break;
1648 
1649 	case SO_KEEPALIVE:
1650 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1651 		break;
1652 
1653 	case SO_TYPE:
1654 		v.val = sk->sk_type;
1655 		break;
1656 
1657 	case SO_PROTOCOL:
1658 		v.val = sk->sk_protocol;
1659 		break;
1660 
1661 	case SO_DOMAIN:
1662 		v.val = sk->sk_family;
1663 		break;
1664 
1665 	case SO_ERROR:
1666 		v.val = -sock_error(sk);
1667 		if (v.val == 0)
1668 			v.val = xchg(&sk->sk_err_soft, 0);
1669 		break;
1670 
1671 	case SO_OOBINLINE:
1672 		v.val = sock_flag(sk, SOCK_URGINLINE);
1673 		break;
1674 
1675 	case SO_NO_CHECK:
1676 		v.val = sk->sk_no_check_tx;
1677 		break;
1678 
1679 	case SO_PRIORITY:
1680 		v.val = READ_ONCE(sk->sk_priority);
1681 		break;
1682 
1683 	case SO_LINGER:
1684 		lv		= sizeof(v.ling);
1685 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1686 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1687 		break;
1688 
1689 	case SO_BSDCOMPAT:
1690 		break;
1691 
1692 	case SO_TIMESTAMP_OLD:
1693 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1694 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1695 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1696 		break;
1697 
1698 	case SO_TIMESTAMPNS_OLD:
1699 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1700 		break;
1701 
1702 	case SO_TIMESTAMP_NEW:
1703 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1704 		break;
1705 
1706 	case SO_TIMESTAMPNS_NEW:
1707 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1708 		break;
1709 
1710 	case SO_TIMESTAMPING_OLD:
1711 	case SO_TIMESTAMPING_NEW:
1712 		lv = sizeof(v.timestamping);
1713 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1714 		 * returning the flags when they were set through the same option.
1715 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1716 		 */
1717 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1718 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1719 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1720 		}
1721 		break;
1722 
1723 	case SO_RCVTIMEO_OLD:
1724 	case SO_RCVTIMEO_NEW:
1725 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1726 				      SO_RCVTIMEO_OLD == optname);
1727 		break;
1728 
1729 	case SO_SNDTIMEO_OLD:
1730 	case SO_SNDTIMEO_NEW:
1731 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1732 				      SO_SNDTIMEO_OLD == optname);
1733 		break;
1734 
1735 	case SO_RCVLOWAT:
1736 		v.val = READ_ONCE(sk->sk_rcvlowat);
1737 		break;
1738 
1739 	case SO_SNDLOWAT:
1740 		v.val = 1;
1741 		break;
1742 
1743 	case SO_PASSCRED:
1744 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1745 		break;
1746 
1747 	case SO_PASSPIDFD:
1748 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1749 		break;
1750 
1751 	case SO_PEERCRED:
1752 	{
1753 		struct ucred peercred;
1754 		if (len > sizeof(peercred))
1755 			len = sizeof(peercred);
1756 
1757 		spin_lock(&sk->sk_peer_lock);
1758 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1759 		spin_unlock(&sk->sk_peer_lock);
1760 
1761 		if (copy_to_sockptr(optval, &peercred, len))
1762 			return -EFAULT;
1763 		goto lenout;
1764 	}
1765 
1766 	case SO_PEERPIDFD:
1767 	{
1768 		struct pid *peer_pid;
1769 		struct file *pidfd_file = NULL;
1770 		int pidfd;
1771 
1772 		if (len > sizeof(pidfd))
1773 			len = sizeof(pidfd);
1774 
1775 		spin_lock(&sk->sk_peer_lock);
1776 		peer_pid = get_pid(sk->sk_peer_pid);
1777 		spin_unlock(&sk->sk_peer_lock);
1778 
1779 		if (!peer_pid)
1780 			return -ENODATA;
1781 
1782 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1783 		put_pid(peer_pid);
1784 		if (pidfd < 0)
1785 			return pidfd;
1786 
1787 		if (copy_to_sockptr(optval, &pidfd, len) ||
1788 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1789 			put_unused_fd(pidfd);
1790 			fput(pidfd_file);
1791 
1792 			return -EFAULT;
1793 		}
1794 
1795 		fd_install(pidfd, pidfd_file);
1796 		return 0;
1797 	}
1798 
1799 	case SO_PEERGROUPS:
1800 	{
1801 		const struct cred *cred;
1802 		int ret, n;
1803 
1804 		cred = sk_get_peer_cred(sk);
1805 		if (!cred)
1806 			return -ENODATA;
1807 
1808 		n = cred->group_info->ngroups;
1809 		if (len < n * sizeof(gid_t)) {
1810 			len = n * sizeof(gid_t);
1811 			put_cred(cred);
1812 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1813 		}
1814 		len = n * sizeof(gid_t);
1815 
1816 		ret = groups_to_user(optval, cred->group_info);
1817 		put_cred(cred);
1818 		if (ret)
1819 			return ret;
1820 		goto lenout;
1821 	}
1822 
1823 	case SO_PEERNAME:
1824 	{
1825 		struct sockaddr_storage address;
1826 
1827 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1828 		if (lv < 0)
1829 			return -ENOTCONN;
1830 		if (lv < len)
1831 			return -EINVAL;
1832 		if (copy_to_sockptr(optval, &address, len))
1833 			return -EFAULT;
1834 		goto lenout;
1835 	}
1836 
1837 	/* Dubious BSD thing... Probably nobody even uses it, but
1838 	 * the UNIX standard wants it for whatever reason... -DaveM
1839 	 */
1840 	case SO_ACCEPTCONN:
1841 		v.val = sk->sk_state == TCP_LISTEN;
1842 		break;
1843 
1844 	case SO_PASSSEC:
1845 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1846 		break;
1847 
1848 	case SO_PEERSEC:
1849 		return security_socket_getpeersec_stream(sock,
1850 							 optval, optlen, len);
1851 
1852 	case SO_MARK:
1853 		v.val = READ_ONCE(sk->sk_mark);
1854 		break;
1855 
1856 	case SO_RCVMARK:
1857 		v.val = sock_flag(sk, SOCK_RCVMARK);
1858 		break;
1859 
1860 	case SO_RXQ_OVFL:
1861 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1862 		break;
1863 
1864 	case SO_WIFI_STATUS:
1865 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1866 		break;
1867 
1868 	case SO_PEEK_OFF:
1869 		if (!READ_ONCE(sock->ops)->set_peek_off)
1870 			return -EOPNOTSUPP;
1871 
1872 		v.val = READ_ONCE(sk->sk_peek_off);
1873 		break;
1874 	case SO_NOFCS:
1875 		v.val = sock_flag(sk, SOCK_NOFCS);
1876 		break;
1877 
1878 	case SO_BINDTODEVICE:
1879 		return sock_getbindtodevice(sk, optval, optlen, len);
1880 
1881 	case SO_GET_FILTER:
1882 		len = sk_get_filter(sk, optval, len);
1883 		if (len < 0)
1884 			return len;
1885 
1886 		goto lenout;
1887 
1888 	case SO_LOCK_FILTER:
1889 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1890 		break;
1891 
1892 	case SO_BPF_EXTENSIONS:
1893 		v.val = bpf_tell_extensions();
1894 		break;
1895 
1896 	case SO_SELECT_ERR_QUEUE:
1897 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1898 		break;
1899 
1900 #ifdef CONFIG_NET_RX_BUSY_POLL
1901 	case SO_BUSY_POLL:
1902 		v.val = READ_ONCE(sk->sk_ll_usec);
1903 		break;
1904 	case SO_PREFER_BUSY_POLL:
1905 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1906 		break;
1907 #endif
1908 
1909 	case SO_MAX_PACING_RATE:
1910 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
1911 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1912 			lv = sizeof(v.ulval);
1913 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1914 		} else {
1915 			/* 32bit version */
1916 			v.val = min_t(unsigned long, ~0U,
1917 				      READ_ONCE(sk->sk_max_pacing_rate));
1918 		}
1919 		break;
1920 
1921 	case SO_INCOMING_CPU:
1922 		v.val = READ_ONCE(sk->sk_incoming_cpu);
1923 		break;
1924 
1925 	case SO_MEMINFO:
1926 	{
1927 		u32 meminfo[SK_MEMINFO_VARS];
1928 
1929 		sk_get_meminfo(sk, meminfo);
1930 
1931 		len = min_t(unsigned int, len, sizeof(meminfo));
1932 		if (copy_to_sockptr(optval, &meminfo, len))
1933 			return -EFAULT;
1934 
1935 		goto lenout;
1936 	}
1937 
1938 #ifdef CONFIG_NET_RX_BUSY_POLL
1939 	case SO_INCOMING_NAPI_ID:
1940 		v.val = READ_ONCE(sk->sk_napi_id);
1941 
1942 		/* aggregate non-NAPI IDs down to 0 */
1943 		if (v.val < MIN_NAPI_ID)
1944 			v.val = 0;
1945 
1946 		break;
1947 #endif
1948 
1949 	case SO_COOKIE:
1950 		lv = sizeof(u64);
1951 		if (len < lv)
1952 			return -EINVAL;
1953 		v.val64 = sock_gen_cookie(sk);
1954 		break;
1955 
1956 	case SO_ZEROCOPY:
1957 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1958 		break;
1959 
1960 	case SO_TXTIME:
1961 		lv = sizeof(v.txtime);
1962 		v.txtime.clockid = sk->sk_clockid;
1963 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1964 				  SOF_TXTIME_DEADLINE_MODE : 0;
1965 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1966 				  SOF_TXTIME_REPORT_ERRORS : 0;
1967 		break;
1968 
1969 	case SO_BINDTOIFINDEX:
1970 		v.val = READ_ONCE(sk->sk_bound_dev_if);
1971 		break;
1972 
1973 	case SO_NETNS_COOKIE:
1974 		lv = sizeof(u64);
1975 		if (len != lv)
1976 			return -EINVAL;
1977 		v.val64 = sock_net(sk)->net_cookie;
1978 		break;
1979 
1980 	case SO_BUF_LOCK:
1981 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1982 		break;
1983 
1984 	case SO_RESERVE_MEM:
1985 		v.val = READ_ONCE(sk->sk_reserved_mem);
1986 		break;
1987 
1988 	case SO_TXREHASH:
1989 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
1990 		v.val = READ_ONCE(sk->sk_txrehash);
1991 		break;
1992 
1993 	default:
1994 		/* We implement the SO_SNDLOWAT etc to not be settable
1995 		 * (1003.1g 7).
1996 		 */
1997 		return -ENOPROTOOPT;
1998 	}
1999 
2000 	if (len > lv)
2001 		len = lv;
2002 	if (copy_to_sockptr(optval, &v, len))
2003 		return -EFAULT;
2004 lenout:
2005 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2006 		return -EFAULT;
2007 	return 0;
2008 }
2009 
2010 /*
2011  * Initialize an sk_lock.
2012  *
2013  * (We also register the sk_lock with the lock validator.)
2014  */
2015 static inline void sock_lock_init(struct sock *sk)
2016 {
2017 	if (sk->sk_kern_sock)
2018 		sock_lock_init_class_and_name(
2019 			sk,
2020 			af_family_kern_slock_key_strings[sk->sk_family],
2021 			af_family_kern_slock_keys + sk->sk_family,
2022 			af_family_kern_key_strings[sk->sk_family],
2023 			af_family_kern_keys + sk->sk_family);
2024 	else
2025 		sock_lock_init_class_and_name(
2026 			sk,
2027 			af_family_slock_key_strings[sk->sk_family],
2028 			af_family_slock_keys + sk->sk_family,
2029 			af_family_key_strings[sk->sk_family],
2030 			af_family_keys + sk->sk_family);
2031 }
2032 
2033 /*
2034  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2035  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2036  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2037  */
2038 static void sock_copy(struct sock *nsk, const struct sock *osk)
2039 {
2040 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2041 #ifdef CONFIG_SECURITY_NETWORK
2042 	void *sptr = nsk->sk_security;
2043 #endif
2044 
2045 	/* If we move sk_tx_queue_mapping out of the private section,
2046 	 * we must check if sk_tx_queue_clear() is called after
2047 	 * sock_copy() in sk_clone_lock().
2048 	 */
2049 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2050 		     offsetof(struct sock, sk_dontcopy_begin) ||
2051 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2052 		     offsetof(struct sock, sk_dontcopy_end));
2053 
2054 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2055 
2056 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2057 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2058 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2059 
2060 #ifdef CONFIG_SECURITY_NETWORK
2061 	nsk->sk_security = sptr;
2062 	security_sk_clone(osk, nsk);
2063 #endif
2064 }
2065 
2066 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2067 		int family)
2068 {
2069 	struct sock *sk;
2070 	struct kmem_cache *slab;
2071 
2072 	slab = prot->slab;
2073 	if (slab != NULL) {
2074 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2075 		if (!sk)
2076 			return sk;
2077 		if (want_init_on_alloc(priority))
2078 			sk_prot_clear_nulls(sk, prot->obj_size);
2079 	} else
2080 		sk = kmalloc(prot->obj_size, priority);
2081 
2082 	if (sk != NULL) {
2083 		if (security_sk_alloc(sk, family, priority))
2084 			goto out_free;
2085 
2086 		if (!try_module_get(prot->owner))
2087 			goto out_free_sec;
2088 	}
2089 
2090 	return sk;
2091 
2092 out_free_sec:
2093 	security_sk_free(sk);
2094 out_free:
2095 	if (slab != NULL)
2096 		kmem_cache_free(slab, sk);
2097 	else
2098 		kfree(sk);
2099 	return NULL;
2100 }
2101 
2102 static void sk_prot_free(struct proto *prot, struct sock *sk)
2103 {
2104 	struct kmem_cache *slab;
2105 	struct module *owner;
2106 
2107 	owner = prot->owner;
2108 	slab = prot->slab;
2109 
2110 	cgroup_sk_free(&sk->sk_cgrp_data);
2111 	mem_cgroup_sk_free(sk);
2112 	security_sk_free(sk);
2113 	if (slab != NULL)
2114 		kmem_cache_free(slab, sk);
2115 	else
2116 		kfree(sk);
2117 	module_put(owner);
2118 }
2119 
2120 /**
2121  *	sk_alloc - All socket objects are allocated here
2122  *	@net: the applicable net namespace
2123  *	@family: protocol family
2124  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2125  *	@prot: struct proto associated with this new sock instance
2126  *	@kern: is this to be a kernel socket?
2127  */
2128 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2129 		      struct proto *prot, int kern)
2130 {
2131 	struct sock *sk;
2132 
2133 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2134 	if (sk) {
2135 		sk->sk_family = family;
2136 		/*
2137 		 * See comment in struct sock definition to understand
2138 		 * why we need sk_prot_creator -acme
2139 		 */
2140 		sk->sk_prot = sk->sk_prot_creator = prot;
2141 		sk->sk_kern_sock = kern;
2142 		sock_lock_init(sk);
2143 		sk->sk_net_refcnt = kern ? 0 : 1;
2144 		if (likely(sk->sk_net_refcnt)) {
2145 			get_net_track(net, &sk->ns_tracker, priority);
2146 			sock_inuse_add(net, 1);
2147 		} else {
2148 			__netns_tracker_alloc(net, &sk->ns_tracker,
2149 					      false, priority);
2150 		}
2151 
2152 		sock_net_set(sk, net);
2153 		refcount_set(&sk->sk_wmem_alloc, 1);
2154 
2155 		mem_cgroup_sk_alloc(sk);
2156 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2157 		sock_update_classid(&sk->sk_cgrp_data);
2158 		sock_update_netprioidx(&sk->sk_cgrp_data);
2159 		sk_tx_queue_clear(sk);
2160 	}
2161 
2162 	return sk;
2163 }
2164 EXPORT_SYMBOL(sk_alloc);
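
/*
 * Illustrative sketch (not part of sock.c): a minimal protocol ->create()
 * hook as it might use sk_alloc() and sock_init_data().  "example_proto"
 * and "example_create" are hypothetical names used only for illustration;
 * a real protocol registers its struct proto and fills in far more state.
 */
static struct proto example_proto;	/* assumed registered elsewhere */

static int example_create(struct net *net, struct socket *sock, int protocol,
			  int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);	/* attach sk to sock and set defaults */
	sk->sk_protocol = protocol;
	return 0;
}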
2165 
2166 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2167  * grace period. This is the case for UDP sockets and TCP listeners.
2168  */
2169 static void __sk_destruct(struct rcu_head *head)
2170 {
2171 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2172 	struct sk_filter *filter;
2173 
2174 	if (sk->sk_destruct)
2175 		sk->sk_destruct(sk);
2176 
2177 	filter = rcu_dereference_check(sk->sk_filter,
2178 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2179 	if (filter) {
2180 		sk_filter_uncharge(sk, filter);
2181 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2182 	}
2183 
2184 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2185 
2186 #ifdef CONFIG_BPF_SYSCALL
2187 	bpf_sk_storage_free(sk);
2188 #endif
2189 
2190 	if (atomic_read(&sk->sk_omem_alloc))
2191 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2192 			 __func__, atomic_read(&sk->sk_omem_alloc));
2193 
2194 	if (sk->sk_frag.page) {
2195 		put_page(sk->sk_frag.page);
2196 		sk->sk_frag.page = NULL;
2197 	}
2198 
2199 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2200 	put_cred(sk->sk_peer_cred);
2201 	put_pid(sk->sk_peer_pid);
2202 
2203 	if (likely(sk->sk_net_refcnt))
2204 		put_net_track(sock_net(sk), &sk->ns_tracker);
2205 	else
2206 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2207 
2208 	sk_prot_free(sk->sk_prot_creator, sk);
2209 }
2210 
2211 void sk_destruct(struct sock *sk)
2212 {
2213 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2214 
2215 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2216 		reuseport_detach_sock(sk);
2217 		use_call_rcu = true;
2218 	}
2219 
2220 	if (use_call_rcu)
2221 		call_rcu(&sk->sk_rcu, __sk_destruct);
2222 	else
2223 		__sk_destruct(&sk->sk_rcu);
2224 }
2225 
2226 static void __sk_free(struct sock *sk)
2227 {
2228 	if (likely(sk->sk_net_refcnt))
2229 		sock_inuse_add(sock_net(sk), -1);
2230 
2231 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2232 		sock_diag_broadcast_destroy(sk);
2233 	else
2234 		sk_destruct(sk);
2235 }
2236 
2237 void sk_free(struct sock *sk)
2238 {
2239 	/*
2240 	 * We subtract one from sk_wmem_alloc and can tell whether
2241 	 * some packets are still in some tx queue.
2242 	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later
2243 	 */
2244 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2245 		__sk_free(sk);
2246 }
2247 EXPORT_SYMBOL(sk_free);
2248 
2249 static void sk_init_common(struct sock *sk)
2250 {
2251 	skb_queue_head_init(&sk->sk_receive_queue);
2252 	skb_queue_head_init(&sk->sk_write_queue);
2253 	skb_queue_head_init(&sk->sk_error_queue);
2254 
2255 	rwlock_init(&sk->sk_callback_lock);
2256 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2257 			af_rlock_keys + sk->sk_family,
2258 			af_family_rlock_key_strings[sk->sk_family]);
2259 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2260 			af_wlock_keys + sk->sk_family,
2261 			af_family_wlock_key_strings[sk->sk_family]);
2262 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2263 			af_elock_keys + sk->sk_family,
2264 			af_family_elock_key_strings[sk->sk_family]);
2265 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2266 			af_callback_keys + sk->sk_family,
2267 			af_family_clock_key_strings[sk->sk_family]);
2268 }
2269 
2270 /**
2271  *	sk_clone_lock - clone a socket, and lock its clone
2272  *	@sk: the socket to clone
2273  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2274  *
2275  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2276  */
2277 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2278 {
2279 	struct proto *prot = READ_ONCE(sk->sk_prot);
2280 	struct sk_filter *filter;
2281 	bool is_charged = true;
2282 	struct sock *newsk;
2283 
2284 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2285 	if (!newsk)
2286 		goto out;
2287 
2288 	sock_copy(newsk, sk);
2289 
2290 	newsk->sk_prot_creator = prot;
2291 
2292 	/* SANITY */
2293 	if (likely(newsk->sk_net_refcnt)) {
2294 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2295 		sock_inuse_add(sock_net(newsk), 1);
2296 	} else {
2297 		/* Kernel sockets are not elevating the struct net refcount.
2298 		 * Instead, use a tracker to more easily detect if a layer
2299 		 * is not properly dismantling its kernel sockets at netns
2300 		 * destroy time.
2301 		 */
2302 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2303 				      false, priority);
2304 	}
2305 	sk_node_init(&newsk->sk_node);
2306 	sock_lock_init(newsk);
2307 	bh_lock_sock(newsk);
2308 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2309 	newsk->sk_backlog.len = 0;
2310 
2311 	atomic_set(&newsk->sk_rmem_alloc, 0);
2312 
2313 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2314 	refcount_set(&newsk->sk_wmem_alloc, 1);
2315 
2316 	atomic_set(&newsk->sk_omem_alloc, 0);
2317 	sk_init_common(newsk);
2318 
2319 	newsk->sk_dst_cache	= NULL;
2320 	newsk->sk_dst_pending_confirm = 0;
2321 	newsk->sk_wmem_queued	= 0;
2322 	newsk->sk_forward_alloc = 0;
2323 	newsk->sk_reserved_mem  = 0;
2324 	atomic_set(&newsk->sk_drops, 0);
2325 	newsk->sk_send_head	= NULL;
2326 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2327 	atomic_set(&newsk->sk_zckey, 0);
2328 
2329 	sock_reset_flag(newsk, SOCK_DONE);
2330 
2331 	/* sk->sk_memcg will be populated at accept() time */
2332 	newsk->sk_memcg = NULL;
2333 
2334 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2335 
2336 	rcu_read_lock();
2337 	filter = rcu_dereference(sk->sk_filter);
2338 	if (filter != NULL)
2339 		/* though it's an empty new sock, the charging may fail
2340 		 * if sysctl_optmem_max was changed between the creation of the
2341 		 * original socket and the cloning
2342 		 */
2343 		is_charged = sk_filter_charge(newsk, filter);
2344 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2345 	rcu_read_unlock();
2346 
2347 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2348 		/* We need to make sure that we don't uncharge the new
2349 		 * socket if we couldn't charge it in the first place
2350 		 * as otherwise we uncharge the parent's filter.
2351 		 */
2352 		if (!is_charged)
2353 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2354 		sk_free_unlock_clone(newsk);
2355 		newsk = NULL;
2356 		goto out;
2357 	}
2358 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2359 
2360 	if (bpf_sk_storage_clone(sk, newsk)) {
2361 		sk_free_unlock_clone(newsk);
2362 		newsk = NULL;
2363 		goto out;
2364 	}
2365 
2366 	/* Clear sk_user_data if parent had the pointer tagged
2367 	 * as not suitable for copying when cloning.
2368 	 */
2369 	if (sk_user_data_is_nocopy(newsk))
2370 		newsk->sk_user_data = NULL;
2371 
2372 	newsk->sk_err	   = 0;
2373 	newsk->sk_err_soft = 0;
2374 	newsk->sk_priority = 0;
2375 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2376 
2377 	/* Before updating sk_refcnt, we must commit prior changes to memory
2378 	 * (Documentation/RCU/rculist_nulls.rst for details)
2379 	 */
2380 	smp_wmb();
2381 	refcount_set(&newsk->sk_refcnt, 2);
2382 
2383 	sk_set_socket(newsk, NULL);
2384 	sk_tx_queue_clear(newsk);
2385 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2386 
2387 	if (newsk->sk_prot->sockets_allocated)
2388 		sk_sockets_allocated_inc(newsk);
2389 
2390 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2391 		net_enable_timestamp();
2392 out:
2393 	return newsk;
2394 }
2395 EXPORT_SYMBOL_GPL(sk_clone_lock);
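
/*
 * Illustrative sketch (not part of sock.c): as the kernel-doc above notes,
 * the caller of sk_clone_lock() owns the bottom-half lock on the clone and
 * must release it, even when it decides to discard the clone.  The name
 * "example_clone" is hypothetical.
 */
static struct sock *example_clone(const struct sock *sk)
{
	struct sock *newsk = sk_clone_lock(sk, GFP_ATOMIC);

	if (!newsk)
		return NULL;

	/* ... protocol specific setup of newsk would go here ... */

	bh_unlock_sock(newsk);	/* caller must unlock, see kernel-doc above */
	return newsk;
}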
2396 
2397 void sk_free_unlock_clone(struct sock *sk)
2398 {
2399 	/* It is still a raw copy of the parent, so invalidate
2400 	 * the destructor and do a plain sk_free() */
2401 	sk->sk_destruct = NULL;
2402 	bh_unlock_sock(sk);
2403 	sk_free(sk);
2404 }
2405 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2406 
2407 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2408 {
2409 	bool is_ipv6 = false;
2410 	u32 max_size;
2411 
2412 #if IS_ENABLED(CONFIG_IPV6)
2413 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2414 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2415 #endif
2416 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2417 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2418 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2419 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2420 		max_size = GSO_LEGACY_MAX_SIZE;
2421 
2422 	return max_size - (MAX_TCP_HEADER + 1);
2423 }
2424 
2425 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2426 {
2427 	u32 max_segs = 1;
2428 
2429 	sk->sk_route_caps = dst->dev->features;
2430 	if (sk_is_tcp(sk))
2431 		sk->sk_route_caps |= NETIF_F_GSO;
2432 	if (sk->sk_route_caps & NETIF_F_GSO)
2433 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2434 	if (unlikely(sk->sk_gso_disabled))
2435 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2436 	if (sk_can_gso(sk)) {
2437 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2438 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2439 		} else {
2440 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2441 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2442 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2443 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2444 		}
2445 	}
2446 	sk->sk_gso_max_segs = max_segs;
2447 	sk_dst_set(sk, dst);
2448 }
2449 EXPORT_SYMBOL_GPL(sk_setup_caps);
2450 
2451 /*
2452  *	Simple resource managers for sockets.
2453  */
2454 
2455 
2456 /*
2457  * Write buffer destructor automatically called from kfree_skb.
2458  */
2459 void sock_wfree(struct sk_buff *skb)
2460 {
2461 	struct sock *sk = skb->sk;
2462 	unsigned int len = skb->truesize;
2463 	bool free;
2464 
2465 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2466 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2467 		    sk->sk_write_space == sock_def_write_space) {
2468 			rcu_read_lock();
2469 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2470 			sock_def_write_space_wfree(sk);
2471 			rcu_read_unlock();
2472 			if (unlikely(free))
2473 				__sk_free(sk);
2474 			return;
2475 		}
2476 
2477 		/*
2478 		 * Keep a reference on sk_wmem_alloc, this will be released
2479 		 * after sk_write_space() call
2480 		 */
2481 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2482 		sk->sk_write_space(sk);
2483 		len = 1;
2484 	}
2485 	/*
2486 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2487 	 * could not do because of in-flight packets
2488 	 */
2489 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2490 		__sk_free(sk);
2491 }
2492 EXPORT_SYMBOL(sock_wfree);
2493 
2494 /* This variant of sock_wfree() is used by TCP,
2495  * since it sets SOCK_USE_WRITE_QUEUE.
2496  */
2497 void __sock_wfree(struct sk_buff *skb)
2498 {
2499 	struct sock *sk = skb->sk;
2500 
2501 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2502 		__sk_free(sk);
2503 }
2504 
2505 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2506 {
2507 	skb_orphan(skb);
2508 	skb->sk = sk;
2509 #ifdef CONFIG_INET
2510 	if (unlikely(!sk_fullsock(sk))) {
2511 		skb->destructor = sock_edemux;
2512 		sock_hold(sk);
2513 		return;
2514 	}
2515 #endif
2516 	skb->destructor = sock_wfree;
2517 	skb_set_hash_from_sk(skb, sk);
2518 	/*
2519 	 * We used to take a refcount on sk, but the following operation
2520 	 * is enough to guarantee sk_free() won't free this sock until
2521 	 * all in-flight packets are completed
2522 	 */
2523 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2524 }
2525 EXPORT_SYMBOL(skb_set_owner_w);
2526 
2527 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2528 {
2529 	/* Drivers depend on in-order delivery for crypto offload,
2530 	 * partial orphan breaks out-of-order-OK logic.
2531 	 */
2532 	if (skb_is_decrypted(skb))
2533 		return false;
2534 
2535 	return (skb->destructor == sock_wfree ||
2536 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2537 }
2538 
2539 /* This helper is used by netem, as it can hold packets in its
2540  * delay queue. We want to allow the owner socket to send more
2541  * packets, as if they were already TX completed by a typical driver.
2542  * But we also want to keep skb->sk set because some packet schedulers
2543  * rely on it (sch_fq for example).
2544  */
2545 void skb_orphan_partial(struct sk_buff *skb)
2546 {
2547 	if (skb_is_tcp_pure_ack(skb))
2548 		return;
2549 
2550 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2551 		return;
2552 
2553 	skb_orphan(skb);
2554 }
2555 EXPORT_SYMBOL(skb_orphan_partial);
2556 
2557 /*
2558  * Read buffer destructor automatically called from kfree_skb.
2559  */
2560 void sock_rfree(struct sk_buff *skb)
2561 {
2562 	struct sock *sk = skb->sk;
2563 	unsigned int len = skb->truesize;
2564 
2565 	atomic_sub(len, &sk->sk_rmem_alloc);
2566 	sk_mem_uncharge(sk, len);
2567 }
2568 EXPORT_SYMBOL(sock_rfree);
2569 
2570 /*
2571  * Buffer destructor for skbs that are not used directly in read or write
2572  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2573  */
2574 void sock_efree(struct sk_buff *skb)
2575 {
2576 	sock_put(skb->sk);
2577 }
2578 EXPORT_SYMBOL(sock_efree);
2579 
2580 /* Buffer destructor for prefetch/receive path where reference count may
2581  * not be held, e.g. for listen sockets.
2582  */
2583 #ifdef CONFIG_INET
2584 void sock_pfree(struct sk_buff *skb)
2585 {
2586 	struct sock *sk = skb->sk;
2587 
2588 	if (!sk_is_refcounted(sk))
2589 		return;
2590 
2591 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2592 		inet_reqsk(sk)->rsk_listener = NULL;
2593 		reqsk_free(inet_reqsk(sk));
2594 		return;
2595 	}
2596 
2597 	sock_gen_put(sk);
2598 }
2599 EXPORT_SYMBOL(sock_pfree);
2600 #endif /* CONFIG_INET */
2601 
2602 kuid_t sock_i_uid(struct sock *sk)
2603 {
2604 	kuid_t uid;
2605 
2606 	read_lock_bh(&sk->sk_callback_lock);
2607 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2608 	read_unlock_bh(&sk->sk_callback_lock);
2609 	return uid;
2610 }
2611 EXPORT_SYMBOL(sock_i_uid);
2612 
2613 unsigned long __sock_i_ino(struct sock *sk)
2614 {
2615 	unsigned long ino;
2616 
2617 	read_lock(&sk->sk_callback_lock);
2618 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2619 	read_unlock(&sk->sk_callback_lock);
2620 	return ino;
2621 }
2622 EXPORT_SYMBOL(__sock_i_ino);
2623 
2624 unsigned long sock_i_ino(struct sock *sk)
2625 {
2626 	unsigned long ino;
2627 
2628 	local_bh_disable();
2629 	ino = __sock_i_ino(sk);
2630 	local_bh_enable();
2631 	return ino;
2632 }
2633 EXPORT_SYMBOL(sock_i_ino);
2634 
2635 /*
2636  * Allocate a skb from the socket's send buffer.
2637  */
2638 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2639 			     gfp_t priority)
2640 {
2641 	if (force ||
2642 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2643 		struct sk_buff *skb = alloc_skb(size, priority);
2644 
2645 		if (skb) {
2646 			skb_set_owner_w(skb, sk);
2647 			return skb;
2648 		}
2649 	}
2650 	return NULL;
2651 }
2652 EXPORT_SYMBOL(sock_wmalloc);
2653 
2654 static void sock_ofree(struct sk_buff *skb)
2655 {
2656 	struct sock *sk = skb->sk;
2657 
2658 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2659 }
2660 
2661 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2662 			     gfp_t priority)
2663 {
2664 	struct sk_buff *skb;
2665 
2666 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2667 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2668 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2669 		return NULL;
2670 
2671 	skb = alloc_skb(size, priority);
2672 	if (!skb)
2673 		return NULL;
2674 
2675 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2676 	skb->sk = sk;
2677 	skb->destructor = sock_ofree;
2678 	return skb;
2679 }
2680 
2681 /*
2682  * Allocate a memory block from the socket's option memory buffer.
2683  */
2684 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2685 {
2686 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2687 
2688 	if ((unsigned int)size <= optmem_max &&
2689 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2690 		void *mem;
2691 		/* First do the add, to avoid the race if kmalloc
2692 		 * might sleep.
2693 		 */
2694 		atomic_add(size, &sk->sk_omem_alloc);
2695 		mem = kmalloc(size, priority);
2696 		if (mem)
2697 			return mem;
2698 		atomic_sub(size, &sk->sk_omem_alloc);
2699 	}
2700 	return NULL;
2701 }
2702 EXPORT_SYMBOL(sock_kmalloc);
2703 
2704 /* Free an option memory block. Note, we actually want the inline
2705  * here as this allows gcc to detect the nullify and fold away the
2706  * condition entirely.
2707  */
2708 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2709 				  const bool nullify)
2710 {
2711 	if (WARN_ON_ONCE(!mem))
2712 		return;
2713 	if (nullify)
2714 		kfree_sensitive(mem);
2715 	else
2716 		kfree(mem);
2717 	atomic_sub(size, &sk->sk_omem_alloc);
2718 }
2719 
2720 void sock_kfree_s(struct sock *sk, void *mem, int size)
2721 {
2722 	__sock_kfree_s(sk, mem, size, false);
2723 }
2724 EXPORT_SYMBOL(sock_kfree_s);
2725 
2726 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2727 {
2728 	__sock_kfree_s(sk, mem, size, true);
2729 }
2730 EXPORT_SYMBOL(sock_kzfree_s);
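
/*
 * Illustrative sketch (not part of sock.c): option memory must be freed
 * with the same size it was charged with so sk_omem_alloc balances.
 * "example_set_opt" is hypothetical; copy_from_sockptr() is assumed from
 * <linux/sockptr.h>.
 */
static int example_set_opt(struct sock *sk, sockptr_t optval, int optlen)
{
	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);

	if (!buf)
		return -ENOBUFS;
	if (copy_from_sockptr(buf, optval, optlen)) {
		sock_kfree_s(sk, buf, optlen);	/* same size as the allocation */
		return -EFAULT;
	}
	/* ... consume buf ... */
	sock_kfree_s(sk, buf, optlen);
	return 0;
}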
2731 
2732 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2733    I think these locks should be removed for datagram sockets.
2734  */
2735 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2736 {
2737 	DEFINE_WAIT(wait);
2738 
2739 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2740 	for (;;) {
2741 		if (!timeo)
2742 			break;
2743 		if (signal_pending(current))
2744 			break;
2745 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2746 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2747 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2748 			break;
2749 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2750 			break;
2751 		if (READ_ONCE(sk->sk_err))
2752 			break;
2753 		timeo = schedule_timeout(timeo);
2754 	}
2755 	finish_wait(sk_sleep(sk), &wait);
2756 	return timeo;
2757 }
2758 
2759 
2760 /*
2761  *	Generic send/receive buffer handlers
2762  */
2763 
2764 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2765 				     unsigned long data_len, int noblock,
2766 				     int *errcode, int max_page_order)
2767 {
2768 	struct sk_buff *skb;
2769 	long timeo;
2770 	int err;
2771 
2772 	timeo = sock_sndtimeo(sk, noblock);
2773 	for (;;) {
2774 		err = sock_error(sk);
2775 		if (err != 0)
2776 			goto failure;
2777 
2778 		err = -EPIPE;
2779 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2780 			goto failure;
2781 
2782 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2783 			break;
2784 
2785 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2786 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2787 		err = -EAGAIN;
2788 		if (!timeo)
2789 			goto failure;
2790 		if (signal_pending(current))
2791 			goto interrupted;
2792 		timeo = sock_wait_for_wmem(sk, timeo);
2793 	}
2794 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2795 				   errcode, sk->sk_allocation);
2796 	if (skb)
2797 		skb_set_owner_w(skb, sk);
2798 	return skb;
2799 
2800 interrupted:
2801 	err = sock_intr_errno(timeo);
2802 failure:
2803 	*errcode = err;
2804 	return NULL;
2805 }
2806 EXPORT_SYMBOL(sock_alloc_send_pskb);
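
/*
 * Illustrative sketch (not part of sock.c): a datagram sendmsg() path
 * typically reserves header room and lets sock_alloc_send_pskb() block
 * (subject to the socket's send timeout) until write space is available.
 * "example_alloc_dgram" is hypothetical; MAX_HEADER comes from
 * <linux/netdevice.h>.
 */
static struct sk_buff *example_alloc_dgram(struct sock *sk, size_t len,
					   int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_pskb(sk, len + MAX_HEADER, 0, noblock, err, 0);
	if (skb)
		skb_reserve(skb, MAX_HEADER);	/* leave room for headers */
	return skb;
}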
2807 
2808 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2809 		     struct sockcm_cookie *sockc)
2810 {
2811 	u32 tsflags;
2812 
2813 	switch (cmsg->cmsg_type) {
2814 	case SO_MARK:
2815 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2816 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2817 			return -EPERM;
2818 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2819 			return -EINVAL;
2820 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2821 		break;
2822 	case SO_TIMESTAMPING_OLD:
2823 	case SO_TIMESTAMPING_NEW:
2824 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2825 			return -EINVAL;
2826 
2827 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2828 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2829 			return -EINVAL;
2830 
2831 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2832 		sockc->tsflags |= tsflags;
2833 		break;
2834 	case SCM_TXTIME:
2835 		if (!sock_flag(sk, SOCK_TXTIME))
2836 			return -EINVAL;
2837 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2838 			return -EINVAL;
2839 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2840 		break;
2841 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2842 	case SCM_RIGHTS:
2843 	case SCM_CREDENTIALS:
2844 		break;
2845 	default:
2846 		return -EINVAL;
2847 	}
2848 	return 0;
2849 }
2850 EXPORT_SYMBOL(__sock_cmsg_send);
2851 
2852 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2853 		   struct sockcm_cookie *sockc)
2854 {
2855 	struct cmsghdr *cmsg;
2856 	int ret;
2857 
2858 	for_each_cmsghdr(cmsg, msg) {
2859 		if (!CMSG_OK(msg, cmsg))
2860 			return -EINVAL;
2861 		if (cmsg->cmsg_level != SOL_SOCKET)
2862 			continue;
2863 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2864 		if (ret)
2865 			return ret;
2866 	}
2867 	return 0;
2868 }
2869 EXPORT_SYMBOL(sock_cmsg_send);
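
/*
 * Illustrative sketch (not part of sock.c): a sendmsg() implementation
 * usually seeds a sockcm_cookie from the socket's defaults (sockcm_init()
 * in <net/sock.h>) and then lets sock_cmsg_send() override it from the
 * SOL_SOCKET control messages.  "example_parse_cmsgs" is hypothetical.
 */
static int example_parse_cmsgs(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	sockcm_init(sockc, sk);
	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}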
2870 
2871 static void sk_enter_memory_pressure(struct sock *sk)
2872 {
2873 	if (!sk->sk_prot->enter_memory_pressure)
2874 		return;
2875 
2876 	sk->sk_prot->enter_memory_pressure(sk);
2877 }
2878 
2879 static void sk_leave_memory_pressure(struct sock *sk)
2880 {
2881 	if (sk->sk_prot->leave_memory_pressure) {
2882 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2883 				     tcp_leave_memory_pressure, sk);
2884 	} else {
2885 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2886 
2887 		if (memory_pressure && READ_ONCE(*memory_pressure))
2888 			WRITE_ONCE(*memory_pressure, 0);
2889 	}
2890 }
2891 
2892 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2893 
2894 /**
2895  * skb_page_frag_refill - check that a page_frag contains enough room
2896  * @sz: minimum size of the fragment we want to get
2897  * @pfrag: pointer to page_frag
2898  * @gfp: priority for memory allocation
2899  *
2900  * Note: While this allocator tries to use high order pages, there is
2901  * no guarantee that allocations succeed. Therefore, @sz MUST be
2902  * less than or equal to PAGE_SIZE.
2903  */
2904 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2905 {
2906 	if (pfrag->page) {
2907 		if (page_ref_count(pfrag->page) == 1) {
2908 			pfrag->offset = 0;
2909 			return true;
2910 		}
2911 		if (pfrag->offset + sz <= pfrag->size)
2912 			return true;
2913 		put_page(pfrag->page);
2914 	}
2915 
2916 	pfrag->offset = 0;
2917 	if (SKB_FRAG_PAGE_ORDER &&
2918 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2919 		/* Avoid direct reclaim but allow kswapd to wake */
2920 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2921 					  __GFP_COMP | __GFP_NOWARN |
2922 					  __GFP_NORETRY,
2923 					  SKB_FRAG_PAGE_ORDER);
2924 		if (likely(pfrag->page)) {
2925 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2926 			return true;
2927 		}
2928 	}
2929 	pfrag->page = alloc_page(gfp);
2930 	if (likely(pfrag->page)) {
2931 		pfrag->size = PAGE_SIZE;
2932 		return true;
2933 	}
2934 	return false;
2935 }
2936 EXPORT_SYMBOL(skb_page_frag_refill);
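
/*
 * Illustrative sketch (not part of sock.c): a typical caller asks for at
 * most PAGE_SIZE bytes and then consumes [offset, offset + copy) of the
 * returned fragment.  "example_append" is hypothetical.
 */
static int example_append(struct sock *sk, struct page_frag *pfrag,
			  unsigned int copy)
{
	if (!skb_page_frag_refill(copy, pfrag, sk->sk_allocation))
		return -ENOMEM;

	/* ... copy 'copy' bytes into pfrag->page at pfrag->offset ... */
	pfrag->offset += copy;
	return 0;
}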
2937 
2938 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2939 {
2940 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2941 		return true;
2942 
2943 	sk_enter_memory_pressure(sk);
2944 	sk_stream_moderate_sndbuf(sk);
2945 	return false;
2946 }
2947 EXPORT_SYMBOL(sk_page_frag_refill);
2948 
2949 void __lock_sock(struct sock *sk)
2950 	__releases(&sk->sk_lock.slock)
2951 	__acquires(&sk->sk_lock.slock)
2952 {
2953 	DEFINE_WAIT(wait);
2954 
2955 	for (;;) {
2956 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2957 					TASK_UNINTERRUPTIBLE);
2958 		spin_unlock_bh(&sk->sk_lock.slock);
2959 		schedule();
2960 		spin_lock_bh(&sk->sk_lock.slock);
2961 		if (!sock_owned_by_user(sk))
2962 			break;
2963 	}
2964 	finish_wait(&sk->sk_lock.wq, &wait);
2965 }
2966 
2967 void __release_sock(struct sock *sk)
2968 	__releases(&sk->sk_lock.slock)
2969 	__acquires(&sk->sk_lock.slock)
2970 {
2971 	struct sk_buff *skb, *next;
2972 
2973 	while ((skb = sk->sk_backlog.head) != NULL) {
2974 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2975 
2976 		spin_unlock_bh(&sk->sk_lock.slock);
2977 
2978 		do {
2979 			next = skb->next;
2980 			prefetch(next);
2981 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
2982 			skb_mark_not_on_list(skb);
2983 			sk_backlog_rcv(sk, skb);
2984 
2985 			cond_resched();
2986 
2987 			skb = next;
2988 		} while (skb != NULL);
2989 
2990 		spin_lock_bh(&sk->sk_lock.slock);
2991 	}
2992 
2993 	/*
2994 	 * Doing the zeroing here guarantees we cannot loop forever
2995 	 * while a wild producer attempts to flood us.
2996 	 */
2997 	sk->sk_backlog.len = 0;
2998 }
2999 
3000 void __sk_flush_backlog(struct sock *sk)
3001 {
3002 	spin_lock_bh(&sk->sk_lock.slock);
3003 	__release_sock(sk);
3004 
3005 	if (sk->sk_prot->release_cb)
3006 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3007 				     tcp_release_cb, sk);
3008 
3009 	spin_unlock_bh(&sk->sk_lock.slock);
3010 }
3011 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3012 
3013 /**
3014  * sk_wait_data - wait for data to arrive at sk_receive_queue
3015  * @sk:    sock to wait on
3016  * @timeo: for how long
3017  * @skb:   last skb seen on sk_receive_queue
3018  *
3019  * Now socket state including sk->sk_err is changed only under lock,
3020  * hence we may omit checks after joining wait queue.
3021  * We check receive queue before schedule() only as optimization;
3022  * it is very likely that release_sock() added new data.
3023  */
3024 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3025 {
3026 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3027 	int rc;
3028 
3029 	add_wait_queue(sk_sleep(sk), &wait);
3030 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3031 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3032 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3033 	remove_wait_queue(sk_sleep(sk), &wait);
3034 	return rc;
3035 }
3036 EXPORT_SYMBOL(sk_wait_data);
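
/*
 * Illustrative sketch (not part of sock.c): sk_wait_data() is called with
 * the socket lock held; sk_wait_event() drops and re-takes it around the
 * sleep, which is why callers re-check the receive queue afterwards.
 * "example_wait_for_data" is hypothetical.
 */
static int example_wait_for_data(struct sock *sk, long *timeo)
{
	int rc = 0;

	while (skb_queue_empty(&sk->sk_receive_queue)) {
		if (!*timeo)
			return -EAGAIN;
		rc = sk_wait_data(sk, timeo, NULL);
		if (rc < 0)
			break;
	}
	return rc;
}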
3037 
3038 /**
3039  *	__sk_mem_raise_allocated - increase memory_allocated
3040  *	@sk: socket
3041  *	@size: memory size to allocate
3042  *	@amt: pages to allocate
3043  *	@kind: allocation type
3044  *
3045  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3046  *
3047  *	Unlike the globally shared limits among the sockets under the same protocol,
3048  *	consuming the budget of a memcg won't have a direct effect on other ones.
3049  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3050  *	whether or not to raise allocated through sk_under_memory_pressure() or
3051  *	its variants.
3052  */
3053 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3054 {
3055 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3056 	struct proto *prot = sk->sk_prot;
3057 	bool charged = false;
3058 	long allocated;
3059 
3060 	sk_memory_allocated_add(sk, amt);
3061 	allocated = sk_memory_allocated(sk);
3062 
3063 	if (memcg) {
3064 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3065 			goto suppress_allocation;
3066 		charged = true;
3067 	}
3068 
3069 	/* Under limit. */
3070 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3071 		sk_leave_memory_pressure(sk);
3072 		return 1;
3073 	}
3074 
3075 	/* Under pressure. */
3076 	if (allocated > sk_prot_mem_limits(sk, 1))
3077 		sk_enter_memory_pressure(sk);
3078 
3079 	/* Over hard limit. */
3080 	if (allocated > sk_prot_mem_limits(sk, 2))
3081 		goto suppress_allocation;
3082 
3083 	/* Guarantee minimum buffer size under pressure (either global
3084 	 * or memcg) to make sure features described in RFC 7323 (TCP
3085 	 * Extensions for High Performance) work properly.
3086 	 *
3087 	 * This rule does NOT stand when usage exceeds the global or memcg hard
3088 	 * limit, or else a DoS attack could take place by spawning
3089 	 * lots of sockets whose usage stays under the minimum buffer size.
3090 	 */
3091 	if (kind == SK_MEM_RECV) {
3092 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3093 			return 1;
3094 
3095 	} else { /* SK_MEM_SEND */
3096 		int wmem0 = sk_get_wmem0(sk, prot);
3097 
3098 		if (sk->sk_type == SOCK_STREAM) {
3099 			if (sk->sk_wmem_queued < wmem0)
3100 				return 1;
3101 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3102 				return 1;
3103 		}
3104 	}
3105 
3106 	if (sk_has_memory_pressure(sk)) {
3107 		u64 alloc;
3108 
3109 		/* The following 'average' heuristic is within the
3110 		 * scope of global accounting, so it only makes
3111 		 * sense for global memory pressure.
3112 		 */
3113 		if (!sk_under_global_memory_pressure(sk))
3114 			return 1;
3115 
3116 		/* Try to be fair among all the sockets under global
3117 		 * pressure by allowing the ones with below-average
3118 		 * usage to raise their allocation.
3119 		 */
3120 		alloc = sk_sockets_allocated_read_positive(sk);
3121 		if (sk_prot_mem_limits(sk, 2) > alloc *
3122 		    sk_mem_pages(sk->sk_wmem_queued +
3123 				 atomic_read(&sk->sk_rmem_alloc) +
3124 				 sk->sk_forward_alloc))
3125 			return 1;
3126 	}
3127 
3128 suppress_allocation:
3129 
3130 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3131 		sk_stream_moderate_sndbuf(sk);
3132 
3133 		/* Fail only if socket is _under_ its sndbuf.
3134 		 * In this case we cannot block, so we have to fail.
3135 		 */
3136 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3137 			/* Force charge with __GFP_NOFAIL */
3138 			if (memcg && !charged) {
3139 				mem_cgroup_charge_skmem(memcg, amt,
3140 					gfp_memcg_charge() | __GFP_NOFAIL);
3141 			}
3142 			return 1;
3143 		}
3144 	}
3145 
3146 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3147 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3148 
3149 	sk_memory_allocated_sub(sk, amt);
3150 
3151 	if (charged)
3152 		mem_cgroup_uncharge_skmem(memcg, amt);
3153 
3154 	return 0;
3155 }
3156 
3157 /**
3158  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3159  *	@sk: socket
3160  *	@size: memory size to allocate
3161  *	@kind: allocation type
3162  *
3163  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3164  *	rmem allocation. This function assumes that protocols which have
3165  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3166  */
3167 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3168 {
3169 	int ret, amt = sk_mem_pages(size);
3170 
3171 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3172 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3173 	if (!ret)
3174 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3175 	return ret;
3176 }
3177 EXPORT_SYMBOL(__sk_mem_schedule);
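
/*
 * Illustrative sketch (not part of sock.c): accounting is done in whole
 * pages, e.g. charging a 3000 byte skb schedules sk_mem_pages(3000) == 1
 * page (4096 bytes on most architectures) into sk_forward_alloc, and the
 * remainder stays available for later skbs until __sk_mem_reclaim() gives
 * it back.  "example_charge_rmem" is hypothetical.
 */
static bool example_charge_rmem(struct sock *sk, struct sk_buff *skb)
{
	if (!__sk_mem_schedule(sk, skb->truesize, SK_MEM_RECV))
		return false;			/* over protocol or memcg limit */

	sk_mem_charge(sk, skb->truesize);	/* consume part of the page(s) */
	atomic_add(skb->truesize, &sk->sk_rmem_alloc);
	return true;
}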
3178 
3179 /**
3180  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3181  *	@sk: socket
3182  *	@amount: number of quanta
3183  *
3184  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3185  */
3186 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3187 {
3188 	sk_memory_allocated_sub(sk, amount);
3189 
3190 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3191 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3192 
3193 	if (sk_under_global_memory_pressure(sk) &&
3194 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3195 		sk_leave_memory_pressure(sk);
3196 }
3197 
3198 /**
3199  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3200  *	@sk: socket
3201  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3202  */
3203 void __sk_mem_reclaim(struct sock *sk, int amount)
3204 {
3205 	amount >>= PAGE_SHIFT;
3206 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3207 	__sk_mem_reduce_allocated(sk, amount);
3208 }
3209 EXPORT_SYMBOL(__sk_mem_reclaim);
3210 
3211 int sk_set_peek_off(struct sock *sk, int val)
3212 {
3213 	WRITE_ONCE(sk->sk_peek_off, val);
3214 	return 0;
3215 }
3216 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3217 
3218 /*
3219  * Set of default routines for initialising struct proto_ops when
3220  * the protocol does not support a particular function. In certain
3221  * cases where it makes no sense for a protocol to have a "do nothing"
3222  * function, some default processing is provided.
3223  */
3224 
3225 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3226 {
3227 	return -EOPNOTSUPP;
3228 }
3229 EXPORT_SYMBOL(sock_no_bind);
3230 
3231 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3232 		    int len, int flags)
3233 {
3234 	return -EOPNOTSUPP;
3235 }
3236 EXPORT_SYMBOL(sock_no_connect);
3237 
3238 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3239 {
3240 	return -EOPNOTSUPP;
3241 }
3242 EXPORT_SYMBOL(sock_no_socketpair);
3243 
3244 int sock_no_accept(struct socket *sock, struct socket *newsock,
3245 		   struct proto_accept_arg *arg)
3246 {
3247 	return -EOPNOTSUPP;
3248 }
3249 EXPORT_SYMBOL(sock_no_accept);
3250 
3251 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3252 		    int peer)
3253 {
3254 	return -EOPNOTSUPP;
3255 }
3256 EXPORT_SYMBOL(sock_no_getname);
3257 
3258 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3259 {
3260 	return -EOPNOTSUPP;
3261 }
3262 EXPORT_SYMBOL(sock_no_ioctl);
3263 
3264 int sock_no_listen(struct socket *sock, int backlog)
3265 {
3266 	return -EOPNOTSUPP;
3267 }
3268 EXPORT_SYMBOL(sock_no_listen);
3269 
3270 int sock_no_shutdown(struct socket *sock, int how)
3271 {
3272 	return -EOPNOTSUPP;
3273 }
3274 EXPORT_SYMBOL(sock_no_shutdown);
3275 
3276 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3277 {
3278 	return -EOPNOTSUPP;
3279 }
3280 EXPORT_SYMBOL(sock_no_sendmsg);
3281 
3282 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3283 {
3284 	return -EOPNOTSUPP;
3285 }
3286 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3287 
3288 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3289 		    int flags)
3290 {
3291 	return -EOPNOTSUPP;
3292 }
3293 EXPORT_SYMBOL(sock_no_recvmsg);
3294 
3295 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3296 {
3297 	/* Mirror missing mmap method error code */
3298 	return -ENODEV;
3299 }
3300 EXPORT_SYMBOL(sock_no_mmap);
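
/*
 * Illustrative sketch (not part of sock.c): a protocol that does not
 * support most operations can plug these stubs straight into its
 * proto_ops.  "example_ops" is hypothetical, and a real table would also
 * provide .release, .poll, .sendmsg, .recvmsg and friends.
 */
static const struct proto_ops example_ops = {
	.family		= PF_UNSPEC,		/* placeholder family */
	.owner		= THIS_MODULE,
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.mmap		= sock_no_mmap,
};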
3301 
3302 /*
3303  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3304  * various sock-based usage counts.
3305  */
3306 void __receive_sock(struct file *file)
3307 {
3308 	struct socket *sock;
3309 
3310 	sock = sock_from_file(file);
3311 	if (sock) {
3312 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3313 		sock_update_classid(&sock->sk->sk_cgrp_data);
3314 	}
3315 }
3316 
3317 /*
3318  *	Default Socket Callbacks
3319  */
3320 
3321 static void sock_def_wakeup(struct sock *sk)
3322 {
3323 	struct socket_wq *wq;
3324 
3325 	rcu_read_lock();
3326 	wq = rcu_dereference(sk->sk_wq);
3327 	if (skwq_has_sleeper(wq))
3328 		wake_up_interruptible_all(&wq->wait);
3329 	rcu_read_unlock();
3330 }
3331 
3332 static void sock_def_error_report(struct sock *sk)
3333 {
3334 	struct socket_wq *wq;
3335 
3336 	rcu_read_lock();
3337 	wq = rcu_dereference(sk->sk_wq);
3338 	if (skwq_has_sleeper(wq))
3339 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3340 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3341 	rcu_read_unlock();
3342 }
3343 
3344 void sock_def_readable(struct sock *sk)
3345 {
3346 	struct socket_wq *wq;
3347 
3348 	trace_sk_data_ready(sk);
3349 
3350 	rcu_read_lock();
3351 	wq = rcu_dereference(sk->sk_wq);
3352 	if (skwq_has_sleeper(wq))
3353 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3354 						EPOLLRDNORM | EPOLLRDBAND);
3355 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3356 	rcu_read_unlock();
3357 }
3358 
3359 static void sock_def_write_space(struct sock *sk)
3360 {
3361 	struct socket_wq *wq;
3362 
3363 	rcu_read_lock();
3364 
3365 	/* Do not wake up a writer until he can make "significant"
3366 	 * progress.  --DaveM
3367 	 */
3368 	if (sock_writeable(sk)) {
3369 		wq = rcu_dereference(sk->sk_wq);
3370 		if (skwq_has_sleeper(wq))
3371 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3372 						EPOLLWRNORM | EPOLLWRBAND);
3373 
3374 		/* Should agree with poll, otherwise some programs break */
3375 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3376 	}
3377 
3378 	rcu_read_unlock();
3379 }
3380 
3381 /* An optimised version of sock_def_write_space(), should only be called
3382  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3383  * ->sk_wmem_alloc.
3384  */
3385 static void sock_def_write_space_wfree(struct sock *sk)
3386 {
3387 	/* Do not wake up a writer until he can make "significant"
3388 	 * progress.  --DaveM
3389 	 */
3390 	if (sock_writeable(sk)) {
3391 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3392 
3393 		/* rely on refcount_sub from sock_wfree() */
3394 		smp_mb__after_atomic();
3395 		if (wq && waitqueue_active(&wq->wait))
3396 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3397 						EPOLLWRNORM | EPOLLWRBAND);
3398 
3399 		/* Should agree with poll, otherwise some programs break */
3400 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3401 	}
3402 }
3403 
3404 static void sock_def_destruct(struct sock *sk)
3405 {
3406 }
3407 
3408 void sk_send_sigurg(struct sock *sk)
3409 {
3410 	if (sk->sk_socket && sk->sk_socket->file)
3411 		if (send_sigurg(&sk->sk_socket->file->f_owner))
3412 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3413 }
3414 EXPORT_SYMBOL(sk_send_sigurg);
3415 
3416 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3417 		    unsigned long expires)
3418 {
3419 	if (!mod_timer(timer, expires))
3420 		sock_hold(sk);
3421 }
3422 EXPORT_SYMBOL(sk_reset_timer);
3423 
3424 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3425 {
3426 	if (del_timer(timer))
3427 		__sock_put(sk);
3428 }
3429 EXPORT_SYMBOL(sk_stop_timer);
3430 
3431 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3432 {
3433 	if (del_timer_sync(timer))
3434 		__sock_put(sk);
3435 }
3436 EXPORT_SYMBOL(sk_stop_timer_sync);
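
/*
 * Illustrative sketch (not part of sock.c): sk_reset_timer() takes a
 * reference when it arms a previously idle timer, so the timer callback
 * (or a later sk_stop_timer()) owns the matching put.  "example_timer_cb"
 * is hypothetical and assumes the timer is sk->sk_timer.
 */
static void example_timer_cb(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	/* ... protocol work ... */

	sock_put(sk);	/* drop the reference sk_reset_timer() took */
}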
3437 
3438 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3439 {
3440 	sk_init_common(sk);
3441 	sk->sk_send_head	=	NULL;
3442 
3443 	timer_setup(&sk->sk_timer, NULL, 0);
3444 
3445 	sk->sk_allocation	=	GFP_KERNEL;
3446 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3447 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3448 	sk->sk_state		=	TCP_CLOSE;
3449 	sk->sk_use_task_frag	=	true;
3450 	sk_set_socket(sk, sock);
3451 
3452 	sock_set_flag(sk, SOCK_ZAPPED);
3453 
3454 	if (sock) {
3455 		sk->sk_type	=	sock->type;
3456 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3457 		sock->sk	=	sk;
3458 	} else {
3459 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3460 	}
3461 	sk->sk_uid	=	uid;
3462 
3463 	rwlock_init(&sk->sk_callback_lock);
3464 	if (sk->sk_kern_sock)
3465 		lockdep_set_class_and_name(
3466 			&sk->sk_callback_lock,
3467 			af_kern_callback_keys + sk->sk_family,
3468 			af_family_kern_clock_key_strings[sk->sk_family]);
3469 	else
3470 		lockdep_set_class_and_name(
3471 			&sk->sk_callback_lock,
3472 			af_callback_keys + sk->sk_family,
3473 			af_family_clock_key_strings[sk->sk_family]);
3474 
3475 	sk->sk_state_change	=	sock_def_wakeup;
3476 	sk->sk_data_ready	=	sock_def_readable;
3477 	sk->sk_write_space	=	sock_def_write_space;
3478 	sk->sk_error_report	=	sock_def_error_report;
3479 	sk->sk_destruct		=	sock_def_destruct;
3480 
3481 	sk->sk_frag.page	=	NULL;
3482 	sk->sk_frag.offset	=	0;
3483 	sk->sk_peek_off		=	-1;
3484 
3485 	sk->sk_peer_pid 	=	NULL;
3486 	sk->sk_peer_cred	=	NULL;
3487 	spin_lock_init(&sk->sk_peer_lock);
3488 
3489 	sk->sk_write_pending	=	0;
3490 	sk->sk_rcvlowat		=	1;
3491 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3492 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3493 
3494 	sk->sk_stamp = SK_DEFAULT_STAMP;
3495 #if BITS_PER_LONG==32
3496 	seqlock_init(&sk->sk_stamp_seq);
3497 #endif
3498 	atomic_set(&sk->sk_zckey, 0);
3499 
3500 #ifdef CONFIG_NET_RX_BUSY_POLL
3501 	sk->sk_napi_id		=	0;
3502 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3503 #endif
3504 
3505 	sk->sk_max_pacing_rate = ~0UL;
3506 	sk->sk_pacing_rate = ~0UL;
3507 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3508 	sk->sk_incoming_cpu = -1;
3509 
3510 	sk_rx_queue_clear(sk);
3511 	/*
3512 	 * Before updating sk_refcnt, we must commit prior changes to memory
3513 	 * (Documentation/RCU/rculist_nulls.rst for details)
3514 	 */
3515 	smp_wmb();
3516 	refcount_set(&sk->sk_refcnt, 1);
3517 	atomic_set(&sk->sk_drops, 0);
3518 }
3519 EXPORT_SYMBOL(sock_init_data_uid);
3520 
3521 void sock_init_data(struct socket *sock, struct sock *sk)
3522 {
3523 	kuid_t uid = sock ?
3524 		SOCK_INODE(sock)->i_uid :
3525 		make_kuid(sock_net(sk)->user_ns, 0);
3526 
3527 	sock_init_data_uid(sock, sk, uid);
3528 }
3529 EXPORT_SYMBOL(sock_init_data);
3530 
3531 void lock_sock_nested(struct sock *sk, int subclass)
3532 {
3533 	/* The sk_lock has mutex_lock() semantics here. */
3534 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3535 
3536 	might_sleep();
3537 	spin_lock_bh(&sk->sk_lock.slock);
3538 	if (sock_owned_by_user_nocheck(sk))
3539 		__lock_sock(sk);
3540 	sk->sk_lock.owned = 1;
3541 	spin_unlock_bh(&sk->sk_lock.slock);
3542 }
3543 EXPORT_SYMBOL(lock_sock_nested);
3544 
3545 void release_sock(struct sock *sk)
3546 {
3547 	spin_lock_bh(&sk->sk_lock.slock);
3548 	if (sk->sk_backlog.tail)
3549 		__release_sock(sk);
3550 
3551 	if (sk->sk_prot->release_cb)
3552 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3553 				     tcp_release_cb, sk);
3554 
3555 	sock_release_ownership(sk);
3556 	if (waitqueue_active(&sk->sk_lock.wq))
3557 		wake_up(&sk->sk_lock.wq);
3558 	spin_unlock_bh(&sk->sk_lock.slock);
3559 }
3560 EXPORT_SYMBOL(release_sock);
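
/*
 * Illustrative sketch (not part of sock.c): the canonical process-context
 * pattern is lock_sock()/release_sock(); release_sock() also runs the
 * backlog that softirq handlers queued while the lock was owned.
 * "example_locked_update" is hypothetical.
 */
static void example_locked_update(struct sock *sk, int val)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);	/* arbitrary example field */
	release_sock(sk);
}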
3561 
3562 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3563 {
3564 	might_sleep();
3565 	spin_lock_bh(&sk->sk_lock.slock);
3566 
3567 	if (!sock_owned_by_user_nocheck(sk)) {
3568 		/*
3569 		 * Fast path return with bottom halves disabled and
3570 		 * sock::sk_lock.slock held.
3571 		 *
3572 		 * The 'mutex' is not contended and holding
3573 		 * sock::sk_lock.slock prevents all other lockers to
3574 		 * proceed so the corresponding unlock_sock_fast() can
3575 		 * avoid the slow path of release_sock() completely and
3576 		 * just release slock.
3577 		 *
3578 		 * From a semantical POV this is equivalent to 'acquiring'
3579 		 * the 'mutex', hence the corresponding lockdep
3580 		 * mutex_release() has to happen in the fast path of
3581 		 * unlock_sock_fast().
3582 		 */
3583 		return false;
3584 	}
3585 
3586 	__lock_sock(sk);
3587 	sk->sk_lock.owned = 1;
3588 	__acquire(&sk->sk_lock.slock);
3589 	spin_unlock_bh(&sk->sk_lock.slock);
3590 	return true;
3591 }
3592 EXPORT_SYMBOL(__lock_sock_fast);
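
/*
 * Illustrative sketch (not part of sock.c): lock_sock_fast() (the wrapper
 * around __lock_sock_fast() in <net/sock.h>) reports whether the slow path
 * was taken, and that value must be handed back to unlock_sock_fast().
 * "example_fast_read_err" is hypothetical.
 */
static int example_fast_read_err(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	int err = sk->sk_err;

	unlock_sock_fast(sk, slow);
	return err;
}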
3593 
3594 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3595 		   bool timeval, bool time32)
3596 {
3597 	struct sock *sk = sock->sk;
3598 	struct timespec64 ts;
3599 
3600 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3601 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3602 	if (ts.tv_sec == -1)
3603 		return -ENOENT;
3604 	if (ts.tv_sec == 0) {
3605 		ktime_t kt = ktime_get_real();
3606 		sock_write_timestamp(sk, kt);
3607 		ts = ktime_to_timespec64(kt);
3608 	}
3609 
3610 	if (timeval)
3611 		ts.tv_nsec /= 1000;
3612 
3613 #ifdef CONFIG_COMPAT_32BIT_TIME
3614 	if (time32)
3615 		return put_old_timespec32(&ts, userstamp);
3616 #endif
3617 #ifdef CONFIG_SPARC64
3618 	/* beware of padding in sparc64 timeval */
3619 	if (timeval && !in_compat_syscall()) {
3620 		struct __kernel_old_timeval __user tv = {
3621 			.tv_sec = ts.tv_sec,
3622 			.tv_usec = ts.tv_nsec,
3623 		};
3624 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3625 			return -EFAULT;
3626 		return 0;
3627 	}
3628 #endif
3629 	return put_timespec64(&ts, userstamp);
3630 }
3631 EXPORT_SYMBOL(sock_gettstamp);
3632 
3633 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3634 {
3635 	if (!sock_flag(sk, flag)) {
3636 		unsigned long previous_flags = sk->sk_flags;
3637 
3638 		sock_set_flag(sk, flag);
3639 		/*
3640 		 * we just set one of the two flags which require net
3641 		 * time stamping, but time stamping might have been on
3642 		 * already because of the other one
3643 		 */
3644 		if (sock_needs_netstamp(sk) &&
3645 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3646 			net_enable_timestamp();
3647 	}
3648 }
3649 
3650 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3651 		       int level, int type)
3652 {
3653 	struct sock_exterr_skb *serr;
3654 	struct sk_buff *skb;
3655 	int copied, err;
3656 
3657 	err = -EAGAIN;
3658 	skb = sock_dequeue_err_skb(sk);
3659 	if (skb == NULL)
3660 		goto out;
3661 
3662 	copied = skb->len;
3663 	if (copied > len) {
3664 		msg->msg_flags |= MSG_TRUNC;
3665 		copied = len;
3666 	}
3667 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3668 	if (err)
3669 		goto out_free_skb;
3670 
3671 	sock_recv_timestamp(msg, sk, skb);
3672 
3673 	serr = SKB_EXT_ERR(skb);
3674 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3675 
3676 	msg->msg_flags |= MSG_ERRQUEUE;
3677 	err = copied;
3678 
3679 out_free_skb:
3680 	kfree_skb(skb);
3681 out:
3682 	return err;
3683 }
3684 EXPORT_SYMBOL(sock_recv_errqueue);
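
/*
 * Illustrative sketch (hypothetical user-space caller, not part of this
 * file): protocols call sock_recv_errqueue() to service
 * recvmsg(MSG_ERRQUEUE); the queued sock_extended_err then shows up as
 * a control message, e.g. with IP_RECVERR enabled. handle_err() below
 * is a placeholder.
 *
 *	char cbuf[256];
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm;
 *
 *	if (recvmsg(fd, &msg, MSG_ERRQUEUE) < 0)
 *		return;
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_RECVERR)
 *			handle_err((struct sock_extended_err *)CMSG_DATA(cm));
 */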
3685 
3686 /*
3687  *	Get a socket option on a socket.
3688  *
3689  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3690  *	asynchronous errors should be reported by getsockopt. We assume
3691  *	this means if you specify SO_ERROR (otherwise what's the point of it).
3692  */
3693 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3694 			   char __user *optval, int __user *optlen)
3695 {
3696 	struct sock *sk = sock->sk;
3697 
3698 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3699 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3700 }
3701 EXPORT_SYMBOL(sock_common_getsockopt);
3702 
3703 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3704 			int flags)
3705 {
3706 	struct sock *sk = sock->sk;
3707 	int addr_len = 0;
3708 	int err;
3709 
3710 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3711 	if (err >= 0)
3712 		msg->msg_namelen = addr_len;
3713 	return err;
3714 }
3715 EXPORT_SYMBOL(sock_common_recvmsg);
3716 
3717 /*
3718  *	Set socket options on an inet socket.
3719  */
3720 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3721 			   sockptr_t optval, unsigned int optlen)
3722 {
3723 	struct sock *sk = sock->sk;
3724 
3725 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3726 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3727 }
3728 EXPORT_SYMBOL(sock_common_setsockopt);
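
/*
 * Illustrative sketch (hypothetical address family, not part of this
 * file): families typically wire these common helpers straight into
 * their struct proto_ops, so option and receive calls are simply
 * forwarded to the underlying sk_prot. PF_EXAMPLE and example_dgram_ops
 * are made-up names.
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 */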
3729 
3730 void sk_common_release(struct sock *sk)
3731 {
3732 	if (sk->sk_prot->destroy)
3733 		sk->sk_prot->destroy(sk);
3734 
3735 	/*
3736 	 * Observation: when sk_common_release() is called, processes have
3737 	 * no access to the socket, but the network stack still does.
3738 	 * Step one, detach it from networking:
3739 	 *
3740 	 * A. Remove from hash tables.
3741 	 */
3742 
3743 	sk->sk_prot->unhash(sk);
3744 
3745 	/*
3746 	 * At this point the socket cannot receive new packets, but some
3747 	 * packets may still be in flight because another CPU ran the receiver
3748 	 * and did its hash table lookup before we unhashed the socket. They
3749 	 * will reach the receive queue and be purged by the socket destructor.
3750 	 *
3751 	 * We also still have packets pending on the receive queue and,
3752 	 * probably, our own packets waiting in device queues. sock_destroy
3753 	 * will drain the receive queue, but transmitted packets will delay
3754 	 * socket destruction until the last reference is released.
3755 	 */
3756 
3757 	sock_orphan(sk);
3758 
3759 	xfrm_sk_free_policy(sk);
3760 
3761 	sock_put(sk);
3762 }
3763 EXPORT_SYMBOL(sk_common_release);
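
/*
 * Illustrative sketch (hypothetical protocol, not part of this file):
 * simple protocols usually end their close path by handing the socket
 * to sk_common_release(), which unhashes it, orphans it and drops the
 * final reference. example_close()/example_flush_pending() are made-up
 * names.
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		example_flush_pending(sk);
 *		sk_common_release(sk);
 *	}
 */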
3764 
3765 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3766 {
3767 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3768 
3769 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3770 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3771 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3772 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3773 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3774 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3775 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3776 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3777 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3778 }
3779 
3780 #ifdef CONFIG_PROC_FS
3781 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3782 
3783 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3784 {
3785 	int cpu, idx = prot->inuse_idx;
3786 	int res = 0;
3787 
3788 	for_each_possible_cpu(cpu)
3789 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3790 
3791 	return res >= 0 ? res : 0;
3792 }
3793 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3794 
3795 int sock_inuse_get(struct net *net)
3796 {
3797 	int cpu, res = 0;
3798 
3799 	for_each_possible_cpu(cpu)
3800 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3801 
3802 	return res;
3803 }
3804 
3805 EXPORT_SYMBOL_GPL(sock_inuse_get);
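
/*
 * Illustrative sketch (abridged, not part of this file): the per-cpu
 * counters summed above are updated from the protocol hash/unhash paths
 * through sock_prot_inuse_add() in <net/sock.h>, with +1 when a socket
 * is hashed and -1 when it is unhashed:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 */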
3806 
3807 static int __net_init sock_inuse_init_net(struct net *net)
3808 {
3809 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3810 	if (net->core.prot_inuse == NULL)
3811 		return -ENOMEM;
3812 	return 0;
3813 }
3814 
3815 static void __net_exit sock_inuse_exit_net(struct net *net)
3816 {
3817 	free_percpu(net->core.prot_inuse);
3818 }
3819 
3820 static struct pernet_operations net_inuse_ops = {
3821 	.init = sock_inuse_init_net,
3822 	.exit = sock_inuse_exit_net,
3823 };
3824 
3825 static __init int net_inuse_init(void)
3826 {
3827 	if (register_pernet_subsys(&net_inuse_ops))
3828 		panic("Cannot initialize net inuse counters");
3829 
3830 	return 0;
3831 }
3832 
3833 core_initcall(net_inuse_init);
3834 
3835 static int assign_proto_idx(struct proto *prot)
3836 {
3837 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3838 
3839 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3840 		pr_err("PROTO_INUSE_NR exhausted\n");
3841 		return -ENOSPC;
3842 	}
3843 
3844 	set_bit(prot->inuse_idx, proto_inuse_idx);
3845 	return 0;
3846 }
3847 
3848 static void release_proto_idx(struct proto *prot)
3849 {
3850 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3851 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3852 }
3853 #else
3854 static inline int assign_proto_idx(struct proto *prot)
3855 {
3856 	return 0;
3857 }
3858 
3859 static inline void release_proto_idx(struct proto *prot)
3860 {
3861 }
3862 
3863 #endif
3864 
3865 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3866 {
3867 	if (!twsk_prot)
3868 		return;
3869 	kfree(twsk_prot->twsk_slab_name);
3870 	twsk_prot->twsk_slab_name = NULL;
3871 	kmem_cache_destroy(twsk_prot->twsk_slab);
3872 	twsk_prot->twsk_slab = NULL;
3873 }
3874 
3875 static int tw_prot_init(const struct proto *prot)
3876 {
3877 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3878 
3879 	if (!twsk_prot)
3880 		return 0;
3881 
3882 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3883 					      prot->name);
3884 	if (!twsk_prot->twsk_slab_name)
3885 		return -ENOMEM;
3886 
3887 	twsk_prot->twsk_slab =
3888 		kmem_cache_create(twsk_prot->twsk_slab_name,
3889 				  twsk_prot->twsk_obj_size, 0,
3890 				  SLAB_ACCOUNT | prot->slab_flags,
3891 				  NULL);
3892 	if (!twsk_prot->twsk_slab) {
3893 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3894 			prot->name);
3895 		return -ENOMEM;
3896 	}
3897 
3898 	return 0;
3899 }
3900 
3901 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3902 {
3903 	if (!rsk_prot)
3904 		return;
3905 	kfree(rsk_prot->slab_name);
3906 	rsk_prot->slab_name = NULL;
3907 	kmem_cache_destroy(rsk_prot->slab);
3908 	rsk_prot->slab = NULL;
3909 }
3910 
3911 static int req_prot_init(const struct proto *prot)
3912 {
3913 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3914 
3915 	if (!rsk_prot)
3916 		return 0;
3917 
3918 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3919 					prot->name);
3920 	if (!rsk_prot->slab_name)
3921 		return -ENOMEM;
3922 
3923 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3924 					   rsk_prot->obj_size, 0,
3925 					   SLAB_ACCOUNT | prot->slab_flags,
3926 					   NULL);
3927 
3928 	if (!rsk_prot->slab) {
3929 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3930 			prot->name);
3931 		return -ENOMEM;
3932 	}
3933 	return 0;
3934 }
3935 
3936 int proto_register(struct proto *prot, int alloc_slab)
3937 {
3938 	int ret = -ENOBUFS;
3939 
3940 	if (prot->memory_allocated && !prot->sysctl_mem) {
3941 		pr_err("%s: missing sysctl_mem\n", prot->name);
3942 		return -EINVAL;
3943 	}
3944 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
3945 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
3946 		return -EINVAL;
3947 	}
3948 	if (alloc_slab) {
3949 		prot->slab = kmem_cache_create_usercopy(prot->name,
3950 					prot->obj_size, 0,
3951 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3952 					prot->slab_flags,
3953 					prot->useroffset, prot->usersize,
3954 					NULL);
3955 
3956 		if (prot->slab == NULL) {
3957 			pr_crit("%s: Can't create sock SLAB cache!\n",
3958 				prot->name);
3959 			goto out;
3960 		}
3961 
3962 		if (req_prot_init(prot))
3963 			goto out_free_request_sock_slab;
3964 
3965 		if (tw_prot_init(prot))
3966 			goto out_free_timewait_sock_slab;
3967 	}
3968 
3969 	mutex_lock(&proto_list_mutex);
3970 	ret = assign_proto_idx(prot);
3971 	if (ret) {
3972 		mutex_unlock(&proto_list_mutex);
3973 		goto out_free_timewait_sock_slab;
3974 	}
3975 	list_add(&prot->node, &proto_list);
3976 	mutex_unlock(&proto_list_mutex);
3977 	return ret;
3978 
3979 out_free_timewait_sock_slab:
3980 	if (alloc_slab)
3981 		tw_prot_cleanup(prot->twsk_prot);
3982 out_free_request_sock_slab:
3983 	if (alloc_slab) {
3984 		req_prot_cleanup(prot->rsk_prot);
3985 
3986 		kmem_cache_destroy(prot->slab);
3987 		prot->slab = NULL;
3988 	}
3989 out:
3990 	return ret;
3991 }
3992 EXPORT_SYMBOL(proto_register);
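
/*
 * Illustrative sketch (hypothetical protocol module, not part of this
 * file): a protocol describes itself with a struct proto and registers
 * it once at init time; passing alloc_slab = 1 requests the sock (and,
 * if declared, request/timewait) caches created above. example_prot and
 * struct example_sock are made-up names.
 *
 *	static struct proto example_prot = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_prot, 1);
 *	}
 */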
3993 
3994 void proto_unregister(struct proto *prot)
3995 {
3996 	mutex_lock(&proto_list_mutex);
3997 	release_proto_idx(prot);
3998 	list_del(&prot->node);
3999 	mutex_unlock(&proto_list_mutex);
4000 
4001 	kmem_cache_destroy(prot->slab);
4002 	prot->slab = NULL;
4003 
4004 	req_prot_cleanup(prot->rsk_prot);
4005 	tw_prot_cleanup(prot->twsk_prot);
4006 }
4007 EXPORT_SYMBOL(proto_unregister);
4008 
4009 int sock_load_diag_module(int family, int protocol)
4010 {
4011 	if (!protocol) {
4012 		if (!sock_is_registered(family))
4013 			return -ENOENT;
4014 
4015 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4016 				      NETLINK_SOCK_DIAG, family);
4017 	}
4018 
4019 #ifdef CONFIG_INET
4020 	if (family == AF_INET &&
4021 	    protocol != IPPROTO_RAW &&
4022 	    protocol < MAX_INET_PROTOS &&
4023 	    !rcu_access_pointer(inet_protos[protocol]))
4024 		return -ENOENT;
4025 #endif
4026 
4027 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4028 			      NETLINK_SOCK_DIAG, family, protocol);
4029 }
4030 EXPORT_SYMBOL(sock_load_diag_module);
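
/*
 * Illustrative sketch (abridged, not part of this file): the
 * request_module() strings above are matched by module aliases that the
 * diag modules advertise; an AF_INET diag module would declare
 * something along the lines of
 *
 *	MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2);
 *
 * (2 being AF_INET) so that "net-pf-16-proto-4-type-2" resolves to it.
 */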
4031 
4032 #ifdef CONFIG_PROC_FS
4033 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4034 	__acquires(proto_list_mutex)
4035 {
4036 	mutex_lock(&proto_list_mutex);
4037 	return seq_list_start_head(&proto_list, *pos);
4038 }
4039 
4040 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4041 {
4042 	return seq_list_next(v, &proto_list, pos);
4043 }
4044 
4045 static void proto_seq_stop(struct seq_file *seq, void *v)
4046 	__releases(proto_list_mutex)
4047 {
4048 	mutex_unlock(&proto_list_mutex);
4049 }
4050 
4051 static char proto_method_implemented(const void *method)
4052 {
4053 	return method == NULL ? 'n' : 'y';
4054 }
4055 static long sock_prot_memory_allocated(struct proto *proto)
4056 {
4057 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4058 }
4059 
4060 static const char *sock_prot_memory_pressure(struct proto *proto)
4061 {
4062 	return proto->memory_pressure != NULL ?
4063 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4064 }
4065 
4066 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4067 {
4068 
4069 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4070 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4071 		   proto->name,
4072 		   proto->obj_size,
4073 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4074 		   sock_prot_memory_allocated(proto),
4075 		   sock_prot_memory_pressure(proto),
4076 		   proto->max_header,
4077 		   proto->slab == NULL ? "no" : "yes",
4078 		   module_name(proto->owner),
4079 		   proto_method_implemented(proto->close),
4080 		   proto_method_implemented(proto->connect),
4081 		   proto_method_implemented(proto->disconnect),
4082 		   proto_method_implemented(proto->accept),
4083 		   proto_method_implemented(proto->ioctl),
4084 		   proto_method_implemented(proto->init),
4085 		   proto_method_implemented(proto->destroy),
4086 		   proto_method_implemented(proto->shutdown),
4087 		   proto_method_implemented(proto->setsockopt),
4088 		   proto_method_implemented(proto->getsockopt),
4089 		   proto_method_implemented(proto->sendmsg),
4090 		   proto_method_implemented(proto->recvmsg),
4091 		   proto_method_implemented(proto->bind),
4092 		   proto_method_implemented(proto->backlog_rcv),
4093 		   proto_method_implemented(proto->hash),
4094 		   proto_method_implemented(proto->unhash),
4095 		   proto_method_implemented(proto->get_port),
4096 		   proto_method_implemented(proto->enter_memory_pressure));
4097 }
4098 
4099 static int proto_seq_show(struct seq_file *seq, void *v)
4100 {
4101 	if (v == &proto_list)
4102 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4103 			   "protocol",
4104 			   "size",
4105 			   "sockets",
4106 			   "memory",
4107 			   "press",
4108 			   "maxhdr",
4109 			   "slab",
4110 			   "module",
4111 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4112 	else
4113 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4114 	return 0;
4115 }
4116 
4117 static const struct seq_operations proto_seq_ops = {
4118 	.start  = proto_seq_start,
4119 	.next   = proto_seq_next,
4120 	.stop   = proto_seq_stop,
4121 	.show   = proto_seq_show,
4122 };
4123 
4124 static __net_init int proto_init_net(struct net *net)
4125 {
4126 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4127 			sizeof(struct seq_net_private)))
4128 		return -ENOMEM;
4129 
4130 	return 0;
4131 }
4132 
4133 static __net_exit void proto_exit_net(struct net *net)
4134 {
4135 	remove_proc_entry("protocols", net->proc_net);
4136 }
4137 
4138 
4139 static __net_initdata struct pernet_operations proto_net_ops = {
4140 	.init = proto_init_net,
4141 	.exit = proto_exit_net,
4142 };
4143 
4144 static int __init proto_init(void)
4145 {
4146 	return register_pernet_subsys(&proto_net_ops);
4147 }
4148 
4149 subsys_initcall(proto_init);
4150 
4151 #endif /* PROC_FS */
4152 
4153 #ifdef CONFIG_NET_RX_BUSY_POLL
4154 bool sk_busy_loop_end(void *p, unsigned long start_time)
4155 {
4156 	struct sock *sk = p;
4157 
4158 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4159 		return true;
4160 
4161 	if (sk_is_udp(sk) &&
4162 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4163 		return true;
4164 
4165 	return sk_busy_loop_timeout(sk, start_time);
4166 }
4167 EXPORT_SYMBOL(sk_busy_loop_end);
4168 #endif /* CONFIG_NET_RX_BUSY_POLL */
4169 
4170 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4171 {
4172 	if (!sk->sk_prot->bind_add)
4173 		return -EOPNOTSUPP;
4174 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4175 }
4176 EXPORT_SYMBOL(sock_bind_add);
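
/*
 * Illustrative sketch (hypothetical protocol, not part of this file):
 * sock_bind_add() just forwards to an optional struct proto hook, so
 * only protocols that can bind extra local addresses provide it and
 * everyone else gets -EOPNOTSUPP. example_bind_add is a made-up name.
 *
 *	static struct proto example_prot = {
 *		...
 *		.bind_add	= example_bind_add,
 *	};
 */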
4177 
4178 /* Copy 'size' bytes in from userspace and copy 'size' bytes of result back out */
4179 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4180 		     void __user *arg, void *karg, size_t size)
4181 {
4182 	int ret;
4183 
4184 	if (copy_from_user(karg, arg, size))
4185 		return -EFAULT;
4186 
4187 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4188 	if (ret)
4189 		return ret;
4190 
4191 	if (copy_to_user(arg, karg, size))
4192 		return -EFAULT;
4193 
4194 	return 0;
4195 }
4196 EXPORT_SYMBOL(sock_ioctl_inout);
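
/*
 * Illustrative sketch (hypothetical callback, not part of this file):
 * because sk_ioctl()/sock_ioctl_inout() perform the user-space copies,
 * a protocol's sk_prot->ioctl only ever touches kernel memory.
 * example_ioctl()/example_inq() are made-up names.
 *
 *	static int example_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		if (cmd != SIOCINQ)
 *			return -ENOIOCTLCMD;
 *		*karg = example_inq(sk);
 *		return 0;
 *	}
 */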
4197 
4198 /* This is the most common ioctl prep function: the result (4 bytes) is
4199  * copied back to userspace if the ioctl() returns successfully. No input
4200  * argument is copied in from userspace.
4201  */
4202 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4203 {
4204 	int ret, karg = 0;
4205 
4206 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4207 	if (ret)
4208 		return ret;
4209 
4210 	return put_user(karg, (int __user *)arg);
4211 }
4212 
4213 /* A wrapper around sock ioctls, which copies the data from userspace
4214  * (depending on the protocol/ioctl), and copies the result back to userspace.
4215  * The main motivation for this function is to pass kernel memory to the
4216  * protocol ioctl callbacks, instead of userspace memory.
4217  */
4218 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4219 {
4220 	int rc = 1;
4221 
4222 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4223 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4224 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4225 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4226 	else if (sk_is_phonet(sk))
4227 		rc = phonet_sk_ioctl(sk, cmd, arg);
4228 
4229 	/* If the ioctl was processed, return its value */
4230 	if (rc <= 0)
4231 		return rc;
4232 
4233 	/* Otherwise call the default handler */
4234 	return sock_ioctl_out(sk, cmd, arg);
4235 }
4236 EXPORT_SYMBOL(sk_ioctl);
4237 
4238 static int __init sock_struct_check(void)
4239 {
4240 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4241 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4242 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4243 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4244 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4245 
4246 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4247 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4248 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4249 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4250 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4251 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4252 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4253 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4254 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4255 
4256 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4257 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4258 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4259 
4260 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4261 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4262 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4263 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4264 
4265 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4267 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4268 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4269 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4270 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4271 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4272 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4273 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4274 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4275 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4276 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4277 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4278 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4279 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4280 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4281 
4282 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4283 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4284 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4285 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4286 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4287 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4288 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4289 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4290 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4291 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4292 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4293 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4294 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4295 	return 0;
4296 }
4297 
4298 core_initcall(sock_struct_check);
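
/*
 * Illustrative sketch (abridged, not part of this file): the asserts
 * above only hold because struct sock groups its hot fields between
 * named cache-line markers, roughly:
 *
 *	struct sock {
 *		...
 *		__cacheline_group_begin(sock_write_rx);
 *		atomic_t	sk_drops;
 *		...
 *		__cacheline_group_end(sock_write_rx);
 *		...
 *	};
 *
 * CACHELINE_ASSERT_GROUP_MEMBER() then breaks the build if a field
 * drifts out of the group it is expected to share cache lines with.
 */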
4299