xref: /linux/net/core/sock.c (revision cfaaa7d010d1fc58f9717fcc8591201e741d2d49)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		Generic socket support routines. Memory allocators, socket lock/release
8  *		handler for protocols to use and generic option handler.
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  */
85 
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87 
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121 
122 #include <linux/uaccess.h>
123 
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138 
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142 
143 #include <trace/events/sock.h>
144 
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148 
149 #include <linux/ethtool.h>
150 
151 #include "dev.h"
152 
153 static DEFINE_MUTEX(proto_list_mutex);
154 static LIST_HEAD(proto_list);
155 
156 static void sock_def_write_space_wfree(struct sock *sk);
157 static void sock_def_write_space(struct sock *sk);
158 
159 /**
160  * sk_ns_capable - General socket capability test
161  * @sk: Socket to use a capability on or through
162  * @user_ns: The user namespace of the capability to use
163  * @cap: The capability to use
164  *
165  * Test to see if the opener of the socket had the capability @cap in the
166  * user namespace @user_ns when the socket was created, and that the
167  * current process has it as well.
168  */
169 bool sk_ns_capable(const struct sock *sk,
170 		   struct user_namespace *user_ns, int cap)
171 {
172 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
173 		ns_capable(user_ns, cap);
174 }
175 EXPORT_SYMBOL(sk_ns_capable);
176 
177 /**
178  * sk_capable - Socket global capability test
179  * @sk: Socket to use a capability on or through
180  * @cap: The global capability to use
181  *
182  * Test to see if the opener of the socket had the capability @cap when
183  * the socket was created, and that the current process has it in all
184  * user namespaces.
185  */
186 bool sk_capable(const struct sock *sk, int cap)
187 {
188 	return sk_ns_capable(sk, &init_user_ns, cap);
189 }
190 EXPORT_SYMBOL(sk_capable);
191 
192 /**
193  * sk_net_capable - Network namespace socket capability test
194  * @sk: Socket to use a capability on or through
195  * @cap: The capability to use
196  *
197  * Test to see if the opener of the socket had the capability @cap over the
198  * network namespace the socket is a member of when the socket was created,
199  * and that the current process has it as well.
200  */
201 bool sk_net_capable(const struct sock *sk, int cap)
202 {
203 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
204 }
205 EXPORT_SYMBOL(sk_net_capable);
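
/* A minimal sketch of how a protocol handler might gate a privileged
 * option with these helpers (illustrative only, not a call site in this
 * file):
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *	... apply the privileged setting ...
 *
 * Both the opener of the socket and the current task must hold
 * CAP_NET_ADMIN in the socket's network namespace for the check to pass.
 */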
206 
207 /*
208  * Each address family might have different locking rules, so we have
209  * one slock key per address family and separate keys for internal and
210  * userspace sockets.
211  */
212 static struct lock_class_key af_family_keys[AF_MAX];
213 static struct lock_class_key af_family_kern_keys[AF_MAX];
214 static struct lock_class_key af_family_slock_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
216 
217 /*
218  * Make lock validator output more readable. (we pre-construct these
219  * strings build-time, so that runtime initialization of socket
220  * locks is fast):
221  */
222 
223 #define _sock_locks(x)						  \
224   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
225   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
226   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
227   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
228   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
229   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
230   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
231   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
232   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
233   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
234   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
235   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
236   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
237   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
238   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
239   x "AF_MCTP"  , \
240   x "AF_MAX"
241 
242 static const char *const af_family_key_strings[AF_MAX+1] = {
243 	_sock_locks("sk_lock-")
244 };
245 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
246 	_sock_locks("slock-")
247 };
248 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
249 	_sock_locks("clock-")
250 };
251 
252 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
253 	_sock_locks("k-sk_lock-")
254 };
255 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
256 	_sock_locks("k-slock-")
257 };
258 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
259 	_sock_locks("k-clock-")
260 };
261 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
262 	_sock_locks("rlock-")
263 };
264 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
265 	_sock_locks("wlock-")
266 };
267 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
268 	_sock_locks("elock-")
269 };
270 
271 /*
272  * sk_callback_lock and sk queues locking rules are per-address-family,
273  * so split the lock classes by using a per-AF key:
274  */
275 static struct lock_class_key af_callback_keys[AF_MAX];
276 static struct lock_class_key af_rlock_keys[AF_MAX];
277 static struct lock_class_key af_wlock_keys[AF_MAX];
278 static struct lock_class_key af_elock_keys[AF_MAX];
279 static struct lock_class_key af_kern_callback_keys[AF_MAX];
280 
281 /* Run time adjustable parameters. */
282 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
283 EXPORT_SYMBOL(sysctl_wmem_max);
284 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
285 EXPORT_SYMBOL(sysctl_rmem_max);
286 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
287 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
288 
289 int sysctl_tstamp_allow_data __read_mostly = 1;
290 
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293 
294 /**
295  * sk_set_memalloc - sets %SOCK_MEMALLOC
296  * @sk: socket to set it on
297  *
298  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299  * It's the responsibility of the admin to adjust min_free_kbytes
300  * to meet the requirements.
301  */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 	sock_set_flag(sk, SOCK_MEMALLOC);
305 	sk->sk_allocation |= __GFP_MEMALLOC;
306 	static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309 
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 	sock_reset_flag(sk, SOCK_MEMALLOC);
313 	sk->sk_allocation &= ~__GFP_MEMALLOC;
314 	static_branch_dec(&memalloc_socks_key);
315 
316 	/*
317 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 	 * it has rmem allocations due to the last swapfile being deactivated
320 	 * but there is a risk that the socket is unusable due to exceeding
321 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 	 */
323 	sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
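
/* A hedged sketch of how a swap-over-network transport might use the
 * two helpers above on its kernel socket (xprt_sk is illustrative):
 *
 *	sk_set_memalloc(xprt_sk);	allow dips into PFMEMALLOC reserves
 *	...
 *	sk_clear_memalloc(xprt_sk);	on teardown; also reclaims any rmem
 *					overshoot built up meanwhile
 */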
326 
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 	int ret;
330 	unsigned int noreclaim_flag;
331 
332 	/* these should have been dropped before queueing */
333 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334 
335 	noreclaim_flag = memalloc_noreclaim_save();
336 	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 				 tcp_v6_do_rcv,
338 				 tcp_v4_do_rcv,
339 				 sk, skb);
340 	memalloc_noreclaim_restore(noreclaim_flag);
341 
342 	return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345 
346 void sk_error_report(struct sock *sk)
347 {
348 	sk->sk_error_report(sk);
349 
350 	switch (sk->sk_family) {
351 	case AF_INET:
352 		fallthrough;
353 	case AF_INET6:
354 		trace_inet_sk_error_report(sk);
355 		break;
356 	default:
357 		break;
358 	}
359 }
360 EXPORT_SYMBOL(sk_error_report);
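
/* A minimal sketch of the usual caller pattern, e.g. from an ICMP error
 * handler (illustrative only):
 *
 *	WRITE_ONCE(sk->sk_err, err);
 *	sk_error_report(sk);
 *
 * Setting sk_err before reporting ensures the woken reader or poller
 * actually sees the error that triggered the wakeup.
 */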
361 
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 	struct __kernel_sock_timeval tv;
365 
366 	if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 		tv.tv_sec = 0;
368 		tv.tv_usec = 0;
369 	} else {
370 		tv.tv_sec = timeo / HZ;
371 		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 	}
373 
374 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 		*(struct old_timeval32 *)optval = tv32;
377 		return sizeof(tv32);
378 	}
379 
380 	if (old_timeval) {
381 		struct __kernel_old_timeval old_tv;
382 		old_tv.tv_sec = tv.tv_sec;
383 		old_tv.tv_usec = tv.tv_usec;
384 		*(struct __kernel_old_timeval *)optval = old_tv;
385 		return sizeof(old_tv);
386 	}
387 
388 	*(struct __kernel_sock_timeval *)optval = tv;
389 	return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392 
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 			   sockptr_t optval, int optlen, bool old_timeval)
395 {
396 	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 		struct old_timeval32 tv32;
398 
399 		if (optlen < sizeof(tv32))
400 			return -EINVAL;
401 
402 		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 			return -EFAULT;
404 		tv->tv_sec = tv32.tv_sec;
405 		tv->tv_usec = tv32.tv_usec;
406 	} else if (old_timeval) {
407 		struct __kernel_old_timeval old_tv;
408 
409 		if (optlen < sizeof(old_tv))
410 			return -EINVAL;
411 		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 			return -EFAULT;
413 		tv->tv_sec = old_tv.tv_sec;
414 		tv->tv_usec = old_tv.tv_usec;
415 	} else {
416 		if (optlen < sizeof(*tv))
417 			return -EINVAL;
418 		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 			return -EFAULT;
420 	}
421 
422 	return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425 
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 			    bool old_timeval)
428 {
429 	struct __kernel_sock_timeval tv;
430 	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 	long val;
432 
433 	if (err)
434 		return err;
435 
436 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 		return -EDOM;
438 
439 	if (tv.tv_sec < 0) {
440 		static int warned __read_mostly;
441 
442 		WRITE_ONCE(*timeo_p, 0);
443 		if (warned < 10 && net_ratelimit()) {
444 			warned++;
445 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 				__func__, current->comm, task_pid_nr(current));
447 		}
448 		return 0;
449 	}
450 	val = MAX_SCHEDULE_TIMEOUT;
451 	if ((tv.tv_sec || tv.tv_usec) &&
452 	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 						    USEC_PER_SEC / HZ);
455 	WRITE_ONCE(*timeo_p, val);
456 	return 0;
457 }
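
/* What sock_set_timeout() ultimately parses, seen from userspace: a
 * plain timeval handed to SO_RCVTIMEO/SO_SNDTIMEO (a minimal sketch,
 * error handling omitted):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * An all-zero timeout maps to MAX_SCHEDULE_TIMEOUT ("block forever"),
 * and a negative tv_sec is forced to 0 with the rate-limited warning
 * above.
 */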
458 
459 static bool sock_needs_netstamp(const struct sock *sk)
460 {
461 	switch (sk->sk_family) {
462 	case AF_UNSPEC:
463 	case AF_UNIX:
464 		return false;
465 	default:
466 		return true;
467 	}
468 }
469 
470 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
471 {
472 	if (sk->sk_flags & flags) {
473 		sk->sk_flags &= ~flags;
474 		if (sock_needs_netstamp(sk) &&
475 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
476 			net_disable_timestamp();
477 	}
478 }
479 
480 
481 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
482 {
483 	unsigned long flags;
484 	struct sk_buff_head *list = &sk->sk_receive_queue;
485 
486 	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
487 		atomic_inc(&sk->sk_drops);
488 		trace_sock_rcvqueue_full(sk, skb);
489 		return -ENOMEM;
490 	}
491 
492 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
493 		atomic_inc(&sk->sk_drops);
494 		return -ENOBUFS;
495 	}
496 
497 	skb->dev = NULL;
498 	skb_set_owner_r(skb, sk);
499 
500 	/* We escape from the RCU protected region, so make sure we don't
501 	 * leak a non-refcounted dst.
502 	 */
503 	skb_dst_force(skb);
504 
505 	spin_lock_irqsave(&list->lock, flags);
506 	sock_skb_set_dropcount(sk, skb);
507 	__skb_queue_tail(list, skb);
508 	spin_unlock_irqrestore(&list->lock, flags);
509 
510 	if (!sock_flag(sk, SOCK_DEAD))
511 		sk->sk_data_ready(sk);
512 	return 0;
513 }
514 EXPORT_SYMBOL(__sock_queue_rcv_skb);
515 
516 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
517 			      enum skb_drop_reason *reason)
518 {
519 	enum skb_drop_reason drop_reason;
520 	int err;
521 
522 	err = sk_filter(sk, skb);
523 	if (err) {
524 		drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
525 		goto out;
526 	}
527 	err = __sock_queue_rcv_skb(sk, skb);
528 	switch (err) {
529 	case -ENOMEM:
530 		drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
531 		break;
532 	case -ENOBUFS:
533 		drop_reason = SKB_DROP_REASON_PROTO_MEM;
534 		break;
535 	default:
536 		drop_reason = SKB_NOT_DROPPED_YET;
537 		break;
538 	}
539 out:
540 	if (reason)
541 		*reason = drop_reason;
542 	return err;
543 }
544 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
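
/* A hedged sketch of how a datagram protocol's rcv handler might feed
 * packets through the helper above (illustrative only):
 *
 *	enum skb_drop_reason reason;
 *	int rc;
 *
 *	rc = sock_queue_rcv_skb_reason(sk, skb, &reason);
 *	if (rc < 0) {
 *		kfree_skb_reason(skb, reason);
 *		return rc;
 *	}
 *
 * -ENOMEM means the receive buffer was full, -ENOBUFS that protocol
 * memory accounting refused the charge; both paths bump sk_drops.
 */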
545 
546 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
547 		     const int nested, unsigned int trim_cap, bool refcounted)
548 {
549 	int rc = NET_RX_SUCCESS;
550 
551 	if (sk_filter_trim_cap(sk, skb, trim_cap))
552 		goto discard_and_relse;
553 
554 	skb->dev = NULL;
555 
556 	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
557 		atomic_inc(&sk->sk_drops);
558 		goto discard_and_relse;
559 	}
560 	if (nested)
561 		bh_lock_sock_nested(sk);
562 	else
563 		bh_lock_sock(sk);
564 	if (!sock_owned_by_user(sk)) {
565 		/*
566 		 * trylock + unlock semantics:
567 		 */
568 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
569 
570 		rc = sk_backlog_rcv(sk, skb);
571 
572 		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
573 	} else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
574 		bh_unlock_sock(sk);
575 		atomic_inc(&sk->sk_drops);
576 		goto discard_and_relse;
577 	}
578 
579 	bh_unlock_sock(sk);
580 out:
581 	if (refcounted)
582 		sock_put(sk);
583 	return rc;
584 discard_and_relse:
585 	kfree_skb(skb);
586 	goto out;
587 }
588 EXPORT_SYMBOL(__sk_receive_skb);
589 
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
591 							  u32));
592 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
593 							   u32));
594 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
595 {
596 	struct dst_entry *dst = __sk_dst_get(sk);
597 
598 	if (dst && dst->obsolete &&
599 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
600 			       dst, cookie) == NULL) {
601 		sk_tx_queue_clear(sk);
602 		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
603 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
604 		dst_release(dst);
605 		return NULL;
606 	}
607 
608 	return dst;
609 }
610 EXPORT_SYMBOL(__sk_dst_check);
611 
612 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
613 {
614 	struct dst_entry *dst = sk_dst_get(sk);
615 
616 	if (dst && dst->obsolete &&
617 	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
618 			       dst, cookie) == NULL) {
619 		sk_dst_reset(sk);
620 		dst_release(dst);
621 		return NULL;
622 	}
623 
624 	return dst;
625 }
626 EXPORT_SYMBOL(sk_dst_check);
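
/* A minimal sketch of the usual transmit-path caller pattern for
 * sk_dst_check(); do_route_lookup() stands in for the protocol's own
 * lookup, and IPv4 callers typically pass 0 as the cookie:
 *
 *	struct dst_entry *dst = sk_dst_check(sk, cookie);
 *
 *	if (!dst) {
 *		dst = do_route_lookup(sk);
 *		sk_dst_set(sk, dst);
 *	}
 *
 * A NULL return means the cached route went stale and has already been
 * dropped from the socket, so a fresh lookup is required.
 */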
627 
628 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
629 {
630 	int ret = -ENOPROTOOPT;
631 #ifdef CONFIG_NETDEVICES
632 	struct net *net = sock_net(sk);
633 
634 	/* Sorry... */
635 	ret = -EPERM;
636 	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
637 		goto out;
638 
639 	ret = -EINVAL;
640 	if (ifindex < 0)
641 		goto out;
642 
643 	/* Paired with all READ_ONCE() done locklessly. */
644 	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
645 
646 	if (sk->sk_prot->rehash)
647 		sk->sk_prot->rehash(sk);
648 	sk_dst_reset(sk);
649 
650 	ret = 0;
651 
652 out:
653 #endif
654 
655 	return ret;
656 }
657 
658 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
659 {
660 	int ret;
661 
662 	if (lock_sk)
663 		lock_sock(sk);
664 	ret = sock_bindtoindex_locked(sk, ifindex);
665 	if (lock_sk)
666 		release_sock(sk);
667 
668 	return ret;
669 }
670 EXPORT_SYMBOL(sock_bindtoindex);
671 
672 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
673 {
674 	int ret = -ENOPROTOOPT;
675 #ifdef CONFIG_NETDEVICES
676 	struct net *net = sock_net(sk);
677 	char devname[IFNAMSIZ];
678 	int index;
679 
680 	ret = -EINVAL;
681 	if (optlen < 0)
682 		goto out;
683 
684 	/* Bind this socket to a particular device like "eth0",
685 	 * as specified in the passed interface name. If the
686 	 * name is "" or the option length is zero the socket
687 	 * is not bound.
688 	 */
689 	if (optlen > IFNAMSIZ - 1)
690 		optlen = IFNAMSIZ - 1;
691 	memset(devname, 0, sizeof(devname));
692 
693 	ret = -EFAULT;
694 	if (copy_from_sockptr(devname, optval, optlen))
695 		goto out;
696 
697 	index = 0;
698 	if (devname[0] != '\0') {
699 		struct net_device *dev;
700 
701 		rcu_read_lock();
702 		dev = dev_get_by_name_rcu(net, devname);
703 		if (dev)
704 			index = dev->ifindex;
705 		rcu_read_unlock();
706 		ret = -ENODEV;
707 		if (!dev)
708 			goto out;
709 	}
710 
711 	sockopt_lock_sock(sk);
712 	ret = sock_bindtoindex_locked(sk, index);
713 	sockopt_release_sock(sk);
714 out:
715 #endif
716 
717 	return ret;
718 }
719 
720 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
721 				sockptr_t optlen, int len)
722 {
723 	int ret = -ENOPROTOOPT;
724 #ifdef CONFIG_NETDEVICES
725 	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
726 	struct net *net = sock_net(sk);
727 	char devname[IFNAMSIZ];
728 
729 	if (bound_dev_if == 0) {
730 		len = 0;
731 		goto zero;
732 	}
733 
734 	ret = -EINVAL;
735 	if (len < IFNAMSIZ)
736 		goto out;
737 
738 	ret = netdev_get_name(net, devname, bound_dev_if);
739 	if (ret)
740 		goto out;
741 
742 	len = strlen(devname) + 1;
743 
744 	ret = -EFAULT;
745 	if (copy_to_sockptr(optval, devname, len))
746 		goto out;
747 
748 zero:
749 	ret = -EFAULT;
750 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
751 		goto out;
752 
753 	ret = 0;
754 
755 out:
756 #endif
757 
758 	return ret;
759 }
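
/* The userspace view of the two bind-to-device helpers above (a hedged
 * sketch; re-binding an already bound socket needs CAP_NET_RAW, per the
 * check in sock_bindtoindex_locked()):
 *
 *	char name[IFNAMSIZ];
 *	socklen_t len = sizeof(name);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	getsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, name, &len);
 *
 * Passing an empty string (or zero length) clears the binding;
 * SO_BINDTOIFINDEX does the same job by interface index.
 */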
760 
761 bool sk_mc_loop(const struct sock *sk)
762 {
763 	if (dev_recursion_level())
764 		return false;
765 	if (!sk)
766 		return true;
767 	/* IPV6_ADDRFORM can change sk->sk_family under us. */
768 	switch (READ_ONCE(sk->sk_family)) {
769 	case AF_INET:
770 		return inet_test_bit(MC_LOOP, sk);
771 #if IS_ENABLED(CONFIG_IPV6)
772 	case AF_INET6:
773 		return inet6_test_bit(MC6_LOOP, sk);
774 #endif
775 	}
776 	WARN_ON_ONCE(1);
777 	return true;
778 }
779 EXPORT_SYMBOL(sk_mc_loop);
780 
781 void sock_set_reuseaddr(struct sock *sk)
782 {
783 	lock_sock(sk);
784 	sk->sk_reuse = SK_CAN_REUSE;
785 	release_sock(sk);
786 }
787 EXPORT_SYMBOL(sock_set_reuseaddr);
788 
789 void sock_set_reuseport(struct sock *sk)
790 {
791 	lock_sock(sk);
792 	sk->sk_reuseport = true;
793 	release_sock(sk);
794 }
795 EXPORT_SYMBOL(sock_set_reuseport);
796 
797 void sock_no_linger(struct sock *sk)
798 {
799 	lock_sock(sk);
800 	WRITE_ONCE(sk->sk_lingertime, 0);
801 	sock_set_flag(sk, SOCK_LINGER);
802 	release_sock(sk);
803 }
804 EXPORT_SYMBOL(sock_no_linger);
805 
806 void sock_set_priority(struct sock *sk, u32 priority)
807 {
808 	WRITE_ONCE(sk->sk_priority, priority);
809 }
810 EXPORT_SYMBOL(sock_set_priority);
811 
812 void sock_set_sndtimeo(struct sock *sk, s64 secs)
813 {
814 	lock_sock(sk);
815 	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
816 		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
817 	else
818 		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
819 	release_sock(sk);
820 }
821 EXPORT_SYMBOL(sock_set_sndtimeo);
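
/* The exported sock_set_*() helpers above exist for in-kernel socket
 * users that cannot go through setsockopt(). A hedged sketch of typical
 * use on a socket created with sock_create_kern() (error handling
 * trimmed):
 *
 *	struct socket *sock;
 *	int err;
 *
 *	err = sock_create_kern(net, AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
 *	if (!err) {
 *		sock_set_reuseaddr(sock->sk);
 *		sock_no_linger(sock->sk);
 *		sock_set_sndtimeo(sock->sk, 5);
 *	}
 */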
822 
823 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
824 {
825 	if (val)  {
826 		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
827 		sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
828 		sock_set_flag(sk, SOCK_RCVTSTAMP);
829 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
830 	} else {
831 		sock_reset_flag(sk, SOCK_RCVTSTAMP);
832 		sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
833 	}
834 }
835 
836 void sock_enable_timestamps(struct sock *sk)
837 {
838 	lock_sock(sk);
839 	__sock_set_timestamps(sk, true, false, true);
840 	release_sock(sk);
841 }
842 EXPORT_SYMBOL(sock_enable_timestamps);
843 
844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
845 {
846 	switch (optname) {
847 	case SO_TIMESTAMP_OLD:
848 		__sock_set_timestamps(sk, valbool, false, false);
849 		break;
850 	case SO_TIMESTAMP_NEW:
851 		__sock_set_timestamps(sk, valbool, true, false);
852 		break;
853 	case SO_TIMESTAMPNS_OLD:
854 		__sock_set_timestamps(sk, valbool, false, true);
855 		break;
856 	case SO_TIMESTAMPNS_NEW:
857 		__sock_set_timestamps(sk, valbool, true, true);
858 		break;
859 	}
860 }
861 
862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
863 {
864 	struct net *net = sock_net(sk);
865 	struct net_device *dev = NULL;
866 	bool match = false;
867 	int *vclock_index;
868 	int i, num;
869 
870 	if (sk->sk_bound_dev_if)
871 		dev = dev_get_by_index(net, sk->sk_bound_dev_if);
872 
873 	if (!dev) {
874 		pr_err("%s: sock not bind to device\n", __func__);
875 		return -EOPNOTSUPP;
876 	}
877 
878 	num = ethtool_get_phc_vclocks(dev, &vclock_index);
879 	dev_put(dev);
880 
881 	for (i = 0; i < num; i++) {
882 		if (*(vclock_index + i) == phc_index) {
883 			match = true;
884 			break;
885 		}
886 	}
887 
888 	if (num > 0)
889 		kfree(vclock_index);
890 
891 	if (!match)
892 		return -EINVAL;
893 
894 	WRITE_ONCE(sk->sk_bind_phc, phc_index);
895 
896 	return 0;
897 }
898 
899 int sock_set_timestamping(struct sock *sk, int optname,
900 			  struct so_timestamping timestamping)
901 {
902 	int val = timestamping.flags;
903 	int ret;
904 
905 	if (val & ~SOF_TIMESTAMPING_MASK)
906 		return -EINVAL;
907 
908 	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
909 	    !(val & SOF_TIMESTAMPING_OPT_ID))
910 		return -EINVAL;
911 
912 	if (val & SOF_TIMESTAMPING_OPT_ID &&
913 	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
914 		if (sk_is_tcp(sk)) {
915 			if ((1 << sk->sk_state) &
916 			    (TCPF_CLOSE | TCPF_LISTEN))
917 				return -EINVAL;
918 			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
919 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
920 			else
921 				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
922 		} else {
923 			atomic_set(&sk->sk_tskey, 0);
924 		}
925 	}
926 
927 	if (val & SOF_TIMESTAMPING_OPT_STATS &&
928 	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
929 		return -EINVAL;
930 
931 	if (val & SOF_TIMESTAMPING_BIND_PHC) {
932 		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
933 		if (ret)
934 			return ret;
935 	}
936 
937 	WRITE_ONCE(sk->sk_tsflags, val);
938 	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
939 
940 	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
941 		sock_enable_timestamp(sk,
942 				      SOCK_TIMESTAMPING_RX_SOFTWARE);
943 	else
944 		sock_disable_timestamp(sk,
945 				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
946 	return 0;
947 }
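
/* Userspace counterpart of sock_set_timestamping(), as a hedged sketch
 * (flags from <linux/net_tstamp.h>, error handling omitted):
 *
 *	unsigned int flags = SOF_TIMESTAMPING_TX_SOFTWARE |
 *			     SOF_TIMESTAMPING_SOFTWARE |
 *			     SOF_TIMESTAMPING_OPT_ID;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &flags, sizeof(flags));
 *
 * Binding to a PHC vclock instead takes a full struct so_timestamping
 * with SOF_TIMESTAMPING_BIND_PHC set, and requires the socket to be
 * bound to the device that owns the clock (see
 * sock_timestamping_bind_phc() above).
 */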
948 
949 void sock_set_keepalive(struct sock *sk)
950 {
951 	lock_sock(sk);
952 	if (sk->sk_prot->keepalive)
953 		sk->sk_prot->keepalive(sk, true);
954 	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
955 	release_sock(sk);
956 }
957 EXPORT_SYMBOL(sock_set_keepalive);
958 
959 static void __sock_set_rcvbuf(struct sock *sk, int val)
960 {
961 	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
962 	 * as a negative value.
963 	 */
964 	val = min_t(int, val, INT_MAX / 2);
965 	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
966 
967 	/* We double it on the way in to account for "struct sk_buff" etc.
968 	 * overhead.   Applications assume that the SO_RCVBUF setting they make
969 	 * will allow that much actual data to be received on that socket.
970 	 *
971 	 * Applications are unaware that "struct sk_buff" and other overheads
972 	 * allocate from the receive buffer during socket buffer allocation.
973 	 *
974 	 * And after considering the possible alternatives, returning the value
975 	 * we actually used in getsockopt is the most desirable behavior.
976 	 */
977 	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
978 }
979 
980 void sock_set_rcvbuf(struct sock *sk, int val)
981 {
982 	lock_sock(sk);
983 	__sock_set_rcvbuf(sk, val);
984 	release_sock(sk);
985 }
986 EXPORT_SYMBOL(sock_set_rcvbuf);
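
/* The doubling documented in __sock_set_rcvbuf() is visible from
 * userspace; a hedged sketch:
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * out reads back 131072 (assuming 65536 <= net.core.rmem_max): twice the
 * requested value, to cover struct sk_buff and bookkeeping overhead.
 */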
987 
988 static void __sock_set_mark(struct sock *sk, u32 val)
989 {
990 	if (val != sk->sk_mark) {
991 		WRITE_ONCE(sk->sk_mark, val);
992 		sk_dst_reset(sk);
993 	}
994 }
995 
996 void sock_set_mark(struct sock *sk, u32 val)
997 {
998 	lock_sock(sk);
999 	__sock_set_mark(sk, val);
1000 	release_sock(sk);
1001 }
1002 EXPORT_SYMBOL(sock_set_mark);
1003 
1004 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1005 {
1006 	/* Round down bytes to multiple of pages */
1007 	bytes = round_down(bytes, PAGE_SIZE);
1008 
1009 	WARN_ON(bytes > sk->sk_reserved_mem);
1010 	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1011 	sk_mem_reclaim(sk);
1012 }
1013 
1014 static int sock_reserve_memory(struct sock *sk, int bytes)
1015 {
1016 	long allocated;
1017 	bool charged;
1018 	int pages;
1019 
1020 	if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1021 		return -EOPNOTSUPP;
1022 
1023 	if (!bytes)
1024 		return 0;
1025 
1026 	pages = sk_mem_pages(bytes);
1027 
1028 	/* pre-charge to memcg */
1029 	charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1030 					  GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1031 	if (!charged)
1032 		return -ENOMEM;
1033 
1034 	/* pre-charge to forward_alloc */
1035 	sk_memory_allocated_add(sk, pages);
1036 	allocated = sk_memory_allocated(sk);
1037 	/* If the system goes into memory pressure with this
1038 	 * precharge, give up and return error.
1039 	 */
1040 	if (allocated > sk_prot_mem_limits(sk, 1)) {
1041 		sk_memory_allocated_sub(sk, pages);
1042 		mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1043 		return -ENOMEM;
1044 	}
1045 	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1046 
1047 	WRITE_ONCE(sk->sk_reserved_mem,
1048 		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1049 
1050 	return 0;
1051 }
1052 
1053 #ifdef CONFIG_PAGE_POOL
1054 
1055 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1056  * in one syscall. The limit exists to bound the amount of memory the kernel
1057  * allocates to copy these tokens, and to prevent looping over the frags for
1058  * too long.
1059  */
1060 #define MAX_DONTNEED_TOKENS 128
1061 #define MAX_DONTNEED_FRAGS 1024
1062 
1063 static noinline_for_stack int
1064 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1065 {
1066 	unsigned int num_tokens, i, j, k, netmem_num = 0;
1067 	struct dmabuf_token *tokens;
1068 	int ret = 0, num_frags = 0;
1069 	netmem_ref netmems[16];
1070 
1071 	if (!sk_is_tcp(sk))
1072 		return -EBADF;
1073 
1074 	if (optlen % sizeof(*tokens) ||
1075 	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1076 		return -EINVAL;
1077 
1078 	num_tokens = optlen / sizeof(*tokens);
1079 	tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1080 	if (!tokens)
1081 		return -ENOMEM;
1082 
1083 	if (copy_from_sockptr(tokens, optval, optlen)) {
1084 		kvfree(tokens);
1085 		return -EFAULT;
1086 	}
1087 
1088 	xa_lock_bh(&sk->sk_user_frags);
1089 	for (i = 0; i < num_tokens; i++) {
1090 		for (j = 0; j < tokens[i].token_count; j++) {
1091 			if (++num_frags > MAX_DONTNEED_FRAGS)
1092 				goto frag_limit_reached;
1093 
1094 			netmem_ref netmem = (__force netmem_ref)__xa_erase(
1095 				&sk->sk_user_frags, tokens[i].token_start + j);
1096 
1097 			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1098 				continue;
1099 
1100 			netmems[netmem_num++] = netmem;
1101 			if (netmem_num == ARRAY_SIZE(netmems)) {
1102 				xa_unlock_bh(&sk->sk_user_frags);
1103 				for (k = 0; k < netmem_num; k++)
1104 					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1105 				netmem_num = 0;
1106 				xa_lock_bh(&sk->sk_user_frags);
1107 			}
1108 			ret++;
1109 		}
1110 	}
1111 
1112 frag_limit_reached:
1113 	xa_unlock_bh(&sk->sk_user_frags);
1114 	for (k = 0; k < netmem_num; k++)
1115 		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1116 
1117 	kvfree(tokens);
1118 	return ret;
1119 }
1120 #endif
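
/* A hedged sketch of the userspace side of SO_DEVMEM_DONTNEED: once the
 * devmem frags reported via cmsg have been consumed, their tokens are
 * returned in one batch (struct dmabuf_token comes from the uapi
 * headers; frag_token is whatever the earlier cmsg reported):
 *
 *	struct dmabuf_token tok = {
 *		.token_start = frag_token,
 *		.token_count = 1,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * The return value counts the frags actually released, bounded by the
 * MAX_DONTNEED_* limits above.
 */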
1121 
1122 void sockopt_lock_sock(struct sock *sk)
1123 {
1124 	/* When current->bpf_ctx is set, the setsockopt is called from
1125 	 * a bpf prog.  bpf has ensured the sk lock has been
1126 	 * acquired before calling setsockopt().
1127 	 */
1128 	if (has_current_bpf_ctx())
1129 		return;
1130 
1131 	lock_sock(sk);
1132 }
1133 EXPORT_SYMBOL(sockopt_lock_sock);
1134 
1135 void sockopt_release_sock(struct sock *sk)
1136 {
1137 	if (has_current_bpf_ctx())
1138 		return;
1139 
1140 	release_sock(sk);
1141 }
1142 EXPORT_SYMBOL(sockopt_release_sock);
1143 
1144 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1145 {
1146 	return has_current_bpf_ctx() || ns_capable(ns, cap);
1147 }
1148 EXPORT_SYMBOL(sockopt_ns_capable);
1149 
1150 bool sockopt_capable(int cap)
1151 {
1152 	return has_current_bpf_ctx() || capable(cap);
1153 }
1154 EXPORT_SYMBOL(sockopt_capable);
1155 
1156 static int sockopt_validate_clockid(__kernel_clockid_t value)
1157 {
1158 	switch (value) {
1159 	case CLOCK_REALTIME:
1160 	case CLOCK_MONOTONIC:
1161 	case CLOCK_TAI:
1162 		return 0;
1163 	}
1164 	return -EINVAL;
1165 }
1166 
1167 /*
1168  *	This is meant for all protocols to use and covers goings on
1169  *	at the socket level. Everything here is generic.
1170  */
1171 
1172 int sk_setsockopt(struct sock *sk, int level, int optname,
1173 		  sockptr_t optval, unsigned int optlen)
1174 {
1175 	struct so_timestamping timestamping;
1176 	struct socket *sock = sk->sk_socket;
1177 	struct sock_txtime sk_txtime;
1178 	int val;
1179 	int valbool;
1180 	struct linger ling;
1181 	int ret = 0;
1182 
1183 	/*
1184 	 *	Options without arguments
1185 	 */
1186 
1187 	if (optname == SO_BINDTODEVICE)
1188 		return sock_setbindtodevice(sk, optval, optlen);
1189 
1190 	if (optlen < sizeof(int))
1191 		return -EINVAL;
1192 
1193 	if (copy_from_sockptr(&val, optval, sizeof(val)))
1194 		return -EFAULT;
1195 
1196 	valbool = val ? 1 : 0;
1197 
1198 	/* handle options which do not require locking the socket. */
1199 	switch (optname) {
1200 	case SO_PRIORITY:
1201 		if ((val >= 0 && val <= 6) ||
1202 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1203 		    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1204 			sock_set_priority(sk, val);
1205 			return 0;
1206 		}
1207 		return -EPERM;
1208 	case SO_PASSSEC:
1209 		assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
1210 		return 0;
1211 	case SO_PASSCRED:
1212 		assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
1213 		return 0;
1214 	case SO_PASSPIDFD:
1215 		assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
1216 		return 0;
1217 	case SO_TYPE:
1218 	case SO_PROTOCOL:
1219 	case SO_DOMAIN:
1220 	case SO_ERROR:
1221 		return -ENOPROTOOPT;
1222 #ifdef CONFIG_NET_RX_BUSY_POLL
1223 	case SO_BUSY_POLL:
1224 		if (val < 0)
1225 			return -EINVAL;
1226 		WRITE_ONCE(sk->sk_ll_usec, val);
1227 		return 0;
1228 	case SO_PREFER_BUSY_POLL:
1229 		if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1230 			return -EPERM;
1231 		WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1232 		return 0;
1233 	case SO_BUSY_POLL_BUDGET:
1234 		if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1235 		    !sockopt_capable(CAP_NET_ADMIN))
1236 			return -EPERM;
1237 		if (val < 0 || val > U16_MAX)
1238 			return -EINVAL;
1239 		WRITE_ONCE(sk->sk_busy_poll_budget, val);
1240 		return 0;
1241 #endif
1242 	case SO_MAX_PACING_RATE:
1243 		{
1244 		unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1245 		unsigned long pacing_rate;
1246 
1247 		if (sizeof(ulval) != sizeof(val) &&
1248 		    optlen >= sizeof(ulval) &&
1249 		    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1250 			return -EFAULT;
1251 		}
1252 		if (ulval != ~0UL)
1253 			cmpxchg(&sk->sk_pacing_status,
1254 				SK_PACING_NONE,
1255 				SK_PACING_NEEDED);
1256 		/* Pairs with READ_ONCE() from sk_getsockopt() */
1257 		WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1258 		pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1259 		if (ulval < pacing_rate)
1260 			WRITE_ONCE(sk->sk_pacing_rate, ulval);
1261 		return 0;
1262 		}
1263 	case SO_TXREHASH:
1264 		if (val < -1 || val > 1)
1265 			return -EINVAL;
1266 		if ((u8)val == SOCK_TXREHASH_DEFAULT)
1267 			val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1268 		/* Paired with READ_ONCE() in tcp_rtx_synack()
1269 		 * and sk_getsockopt().
1270 		 */
1271 		WRITE_ONCE(sk->sk_txrehash, (u8)val);
1272 		return 0;
1273 	case SO_PEEK_OFF:
1274 		{
1275 		int (*set_peek_off)(struct sock *sk, int val);
1276 
1277 		set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1278 		if (set_peek_off)
1279 			ret = set_peek_off(sk, val);
1280 		else
1281 			ret = -EOPNOTSUPP;
1282 		return ret;
1283 		}
1284 #ifdef CONFIG_PAGE_POOL
1285 	case SO_DEVMEM_DONTNEED:
1286 		return sock_devmem_dontneed(sk, optval, optlen);
1287 #endif
1288 	}
1289 
1290 	sockopt_lock_sock(sk);
1291 
1292 	switch (optname) {
1293 	case SO_DEBUG:
1294 		if (val && !sockopt_capable(CAP_NET_ADMIN))
1295 			ret = -EACCES;
1296 		else
1297 			sock_valbool_flag(sk, SOCK_DBG, valbool);
1298 		break;
1299 	case SO_REUSEADDR:
1300 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1301 		break;
1302 	case SO_REUSEPORT:
1303 		sk->sk_reuseport = valbool;
1304 		break;
1305 	case SO_DONTROUTE:
1306 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1307 		sk_dst_reset(sk);
1308 		break;
1309 	case SO_BROADCAST:
1310 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1311 		break;
1312 	case SO_SNDBUF:
1313 		/* Don't error on this; BSD doesn't, and if you think
1314 		 * about it, this is right. Otherwise apps have to
1315 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1316 		 * are treated in BSD as hints.
1317 		 */
1318 		val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1319 set_sndbuf:
1320 		/* Ensure val * 2 fits into an int, to prevent max_t()
1321 		 * from treating it as a negative value.
1322 		 */
1323 		val = min_t(int, val, INT_MAX / 2);
1324 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1325 		WRITE_ONCE(sk->sk_sndbuf,
1326 			   max_t(int, val * 2, SOCK_MIN_SNDBUF));
1327 		/* Wake up sending tasks if we upped the value. */
1328 		sk->sk_write_space(sk);
1329 		break;
1330 
1331 	case SO_SNDBUFFORCE:
1332 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1333 			ret = -EPERM;
1334 			break;
1335 		}
1336 
1337 		/* No negative values (to prevent underflow, as val will be
1338 		 * multiplied by 2).
1339 		 */
1340 		if (val < 0)
1341 			val = 0;
1342 		goto set_sndbuf;
1343 
1344 	case SO_RCVBUF:
1345 		/* Don't error on this; BSD doesn't, and if you think
1346 		 * about it, this is right. Otherwise apps have to
1347 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1348 		 * are treated in BSD as hints.
1349 		 */
1350 		__sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1351 		break;
1352 
1353 	case SO_RCVBUFFORCE:
1354 		if (!sockopt_capable(CAP_NET_ADMIN)) {
1355 			ret = -EPERM;
1356 			break;
1357 		}
1358 
1359 		/* No negative values (to prevent underflow, as val will be
1360 		 * multiplied by 2).
1361 		 */
1362 		__sock_set_rcvbuf(sk, max(val, 0));
1363 		break;
1364 
1365 	case SO_KEEPALIVE:
1366 		if (sk->sk_prot->keepalive)
1367 			sk->sk_prot->keepalive(sk, valbool);
1368 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1369 		break;
1370 
1371 	case SO_OOBINLINE:
1372 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1373 		break;
1374 
1375 	case SO_NO_CHECK:
1376 		sk->sk_no_check_tx = valbool;
1377 		break;
1378 
1379 	case SO_LINGER:
1380 		if (optlen < sizeof(ling)) {
1381 			ret = -EINVAL;	/* 1003.1g */
1382 			break;
1383 		}
1384 		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1385 			ret = -EFAULT;
1386 			break;
1387 		}
1388 		if (!ling.l_onoff) {
1389 			sock_reset_flag(sk, SOCK_LINGER);
1390 		} else {
1391 			unsigned long t_sec = ling.l_linger;
1392 
1393 			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1394 				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1395 			else
1396 				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1397 			sock_set_flag(sk, SOCK_LINGER);
1398 		}
1399 		break;
1400 
1401 	case SO_BSDCOMPAT:
1402 		break;
1403 
1404 	case SO_TIMESTAMP_OLD:
1405 	case SO_TIMESTAMP_NEW:
1406 	case SO_TIMESTAMPNS_OLD:
1407 	case SO_TIMESTAMPNS_NEW:
1408 		sock_set_timestamp(sk, optname, valbool);
1409 		break;
1410 
1411 	case SO_TIMESTAMPING_NEW:
1412 	case SO_TIMESTAMPING_OLD:
1413 		if (optlen == sizeof(timestamping)) {
1414 			if (copy_from_sockptr(&timestamping, optval,
1415 					      sizeof(timestamping))) {
1416 				ret = -EFAULT;
1417 				break;
1418 			}
1419 		} else {
1420 			memset(&timestamping, 0, sizeof(timestamping));
1421 			timestamping.flags = val;
1422 		}
1423 		ret = sock_set_timestamping(sk, optname, timestamping);
1424 		break;
1425 
1426 	case SO_RCVLOWAT:
1427 		{
1428 		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1429 
1430 		if (val < 0)
1431 			val = INT_MAX;
1432 		if (sock)
1433 			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1434 		if (set_rcvlowat)
1435 			ret = set_rcvlowat(sk, val);
1436 		else
1437 			WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1438 		break;
1439 		}
1440 	case SO_RCVTIMEO_OLD:
1441 	case SO_RCVTIMEO_NEW:
1442 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1443 				       optlen, optname == SO_RCVTIMEO_OLD);
1444 		break;
1445 
1446 	case SO_SNDTIMEO_OLD:
1447 	case SO_SNDTIMEO_NEW:
1448 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1449 				       optlen, optname == SO_SNDTIMEO_OLD);
1450 		break;
1451 
1452 	case SO_ATTACH_FILTER: {
1453 		struct sock_fprog fprog;
1454 
1455 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1456 		if (!ret)
1457 			ret = sk_attach_filter(&fprog, sk);
1458 		break;
1459 	}
1460 	case SO_ATTACH_BPF:
1461 		ret = -EINVAL;
1462 		if (optlen == sizeof(u32)) {
1463 			u32 ufd;
1464 
1465 			ret = -EFAULT;
1466 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1467 				break;
1468 
1469 			ret = sk_attach_bpf(ufd, sk);
1470 		}
1471 		break;
1472 
1473 	case SO_ATTACH_REUSEPORT_CBPF: {
1474 		struct sock_fprog fprog;
1475 
1476 		ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1477 		if (!ret)
1478 			ret = sk_reuseport_attach_filter(&fprog, sk);
1479 		break;
1480 	}
1481 	case SO_ATTACH_REUSEPORT_EBPF:
1482 		ret = -EINVAL;
1483 		if (optlen == sizeof(u32)) {
1484 			u32 ufd;
1485 
1486 			ret = -EFAULT;
1487 			if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1488 				break;
1489 
1490 			ret = sk_reuseport_attach_bpf(ufd, sk);
1491 		}
1492 		break;
1493 
1494 	case SO_DETACH_REUSEPORT_BPF:
1495 		ret = reuseport_detach_prog(sk);
1496 		break;
1497 
1498 	case SO_DETACH_FILTER:
1499 		ret = sk_detach_filter(sk);
1500 		break;
1501 
1502 	case SO_LOCK_FILTER:
1503 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1504 			ret = -EPERM;
1505 		else
1506 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1507 		break;
1508 
1509 	case SO_MARK:
1510 		if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1511 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1512 			ret = -EPERM;
1513 			break;
1514 		}
1515 
1516 		__sock_set_mark(sk, val);
1517 		break;
1518 	case SO_RCVMARK:
1519 		sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1520 		break;
1521 
1522 	case SO_RXQ_OVFL:
1523 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1524 		break;
1525 
1526 	case SO_WIFI_STATUS:
1527 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1528 		break;
1529 
1530 	case SO_NOFCS:
1531 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1532 		break;
1533 
1534 	case SO_SELECT_ERR_QUEUE:
1535 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1536 		break;
1537 
1538 
1539 	case SO_INCOMING_CPU:
1540 		reuseport_update_incoming_cpu(sk, val);
1541 		break;
1542 
1543 	case SO_CNX_ADVICE:
1544 		if (val == 1)
1545 			dst_negative_advice(sk);
1546 		break;
1547 
1548 	case SO_ZEROCOPY:
1549 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1550 			if (!(sk_is_tcp(sk) ||
1551 			      (sk->sk_type == SOCK_DGRAM &&
1552 			       sk->sk_protocol == IPPROTO_UDP)))
1553 				ret = -EOPNOTSUPP;
1554 		} else if (sk->sk_family != PF_RDS) {
1555 			ret = -EOPNOTSUPP;
1556 		}
1557 		if (!ret) {
1558 			if (val < 0 || val > 1)
1559 				ret = -EINVAL;
1560 			else
1561 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1562 		}
1563 		break;
1564 
1565 	case SO_TXTIME:
1566 		if (optlen != sizeof(struct sock_txtime)) {
1567 			ret = -EINVAL;
1568 			break;
1569 		} else if (copy_from_sockptr(&sk_txtime, optval,
1570 			   sizeof(struct sock_txtime))) {
1571 			ret = -EFAULT;
1572 			break;
1573 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1574 			ret = -EINVAL;
1575 			break;
1576 		}
1577 		/* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1578 		 * scheduler has enough safeguards.
1579 		 */
1580 		if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1581 		    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1582 			ret = -EPERM;
1583 			break;
1584 		}
1585 
1586 		ret = sockopt_validate_clockid(sk_txtime.clockid);
1587 		if (ret)
1588 			break;
1589 
1590 		sock_valbool_flag(sk, SOCK_TXTIME, true);
1591 		sk->sk_clockid = sk_txtime.clockid;
1592 		sk->sk_txtime_deadline_mode =
1593 			!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1594 		sk->sk_txtime_report_errors =
1595 			!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1596 		break;
1597 
1598 	case SO_BINDTOIFINDEX:
1599 		ret = sock_bindtoindex_locked(sk, val);
1600 		break;
1601 
1602 	case SO_BUF_LOCK:
1603 		if (val & ~SOCK_BUF_LOCK_MASK) {
1604 			ret = -EINVAL;
1605 			break;
1606 		}
1607 		sk->sk_userlocks = val | (sk->sk_userlocks &
1608 					  ~SOCK_BUF_LOCK_MASK);
1609 		break;
1610 
1611 	case SO_RESERVE_MEM:
1612 	{
1613 		int delta;
1614 
1615 		if (val < 0) {
1616 			ret = -EINVAL;
1617 			break;
1618 		}
1619 
1620 		delta = val - sk->sk_reserved_mem;
1621 		if (delta < 0)
1622 			sock_release_reserved_memory(sk, -delta);
1623 		else
1624 			ret = sock_reserve_memory(sk, delta);
1625 		break;
1626 	}
1627 
1628 	default:
1629 		ret = -ENOPROTOOPT;
1630 		break;
1631 	}
1632 	sockopt_release_sock(sk);
1633 	return ret;
1634 }
1635 
1636 int sock_setsockopt(struct socket *sock, int level, int optname,
1637 		    sockptr_t optval, unsigned int optlen)
1638 {
1639 	return sk_setsockopt(sock->sk, level, optname,
1640 			     optval, optlen);
1641 }
1642 EXPORT_SYMBOL(sock_setsockopt);
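
/* sk_setsockopt() takes a sockptr_t, so in-kernel callers can drive the
 * same code path as userspace without the old set_fs() games. A hedged
 * sketch (the option and value are only an example):
 *
 *	int one = 1;
 *
 *	sk_setsockopt(sk, SOL_SOCKET, SO_REUSEADDR,
 *		      KERNEL_SOCKPTR(&one), sizeof(one));
 *
 * Userspace pointers arrive here wrapped as USER_SOCKPTR() by
 * __sys_setsockopt().
 */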
1643 
1644 static const struct cred *sk_get_peer_cred(struct sock *sk)
1645 {
1646 	const struct cred *cred;
1647 
1648 	spin_lock(&sk->sk_peer_lock);
1649 	cred = get_cred(sk->sk_peer_cred);
1650 	spin_unlock(&sk->sk_peer_lock);
1651 
1652 	return cred;
1653 }
1654 
1655 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1656 			  struct ucred *ucred)
1657 {
1658 	ucred->pid = pid_vnr(pid);
1659 	ucred->uid = ucred->gid = -1;
1660 	if (cred) {
1661 		struct user_namespace *current_ns = current_user_ns();
1662 
1663 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1664 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1665 	}
1666 }
1667 
1668 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1669 {
1670 	struct user_namespace *user_ns = current_user_ns();
1671 	int i;
1672 
1673 	for (i = 0; i < src->ngroups; i++) {
1674 		gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1675 
1676 		if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1677 			return -EFAULT;
1678 	}
1679 
1680 	return 0;
1681 }
1682 
1683 int sk_getsockopt(struct sock *sk, int level, int optname,
1684 		  sockptr_t optval, sockptr_t optlen)
1685 {
1686 	struct socket *sock = sk->sk_socket;
1687 
1688 	union {
1689 		int val;
1690 		u64 val64;
1691 		unsigned long ulval;
1692 		struct linger ling;
1693 		struct old_timeval32 tm32;
1694 		struct __kernel_old_timeval tm;
1695 		struct  __kernel_sock_timeval stm;
1696 		struct sock_txtime txtime;
1697 		struct so_timestamping timestamping;
1698 	} v;
1699 
1700 	int lv = sizeof(int);
1701 	int len;
1702 
1703 	if (copy_from_sockptr(&len, optlen, sizeof(int)))
1704 		return -EFAULT;
1705 	if (len < 0)
1706 		return -EINVAL;
1707 
1708 	memset(&v, 0, sizeof(v));
1709 
1710 	switch (optname) {
1711 	case SO_DEBUG:
1712 		v.val = sock_flag(sk, SOCK_DBG);
1713 		break;
1714 
1715 	case SO_DONTROUTE:
1716 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1717 		break;
1718 
1719 	case SO_BROADCAST:
1720 		v.val = sock_flag(sk, SOCK_BROADCAST);
1721 		break;
1722 
1723 	case SO_SNDBUF:
1724 		v.val = READ_ONCE(sk->sk_sndbuf);
1725 		break;
1726 
1727 	case SO_RCVBUF:
1728 		v.val = READ_ONCE(sk->sk_rcvbuf);
1729 		break;
1730 
1731 	case SO_REUSEADDR:
1732 		v.val = sk->sk_reuse;
1733 		break;
1734 
1735 	case SO_REUSEPORT:
1736 		v.val = sk->sk_reuseport;
1737 		break;
1738 
1739 	case SO_KEEPALIVE:
1740 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1741 		break;
1742 
1743 	case SO_TYPE:
1744 		v.val = sk->sk_type;
1745 		break;
1746 
1747 	case SO_PROTOCOL:
1748 		v.val = sk->sk_protocol;
1749 		break;
1750 
1751 	case SO_DOMAIN:
1752 		v.val = sk->sk_family;
1753 		break;
1754 
1755 	case SO_ERROR:
1756 		v.val = -sock_error(sk);
1757 		if (v.val == 0)
1758 			v.val = xchg(&sk->sk_err_soft, 0);
1759 		break;
1760 
1761 	case SO_OOBINLINE:
1762 		v.val = sock_flag(sk, SOCK_URGINLINE);
1763 		break;
1764 
1765 	case SO_NO_CHECK:
1766 		v.val = sk->sk_no_check_tx;
1767 		break;
1768 
1769 	case SO_PRIORITY:
1770 		v.val = READ_ONCE(sk->sk_priority);
1771 		break;
1772 
1773 	case SO_LINGER:
1774 		lv		= sizeof(v.ling);
1775 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1776 		v.ling.l_linger	= READ_ONCE(sk->sk_lingertime) / HZ;
1777 		break;
1778 
1779 	case SO_BSDCOMPAT:
1780 		break;
1781 
1782 	case SO_TIMESTAMP_OLD:
1783 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1784 				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
1785 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1786 		break;
1787 
1788 	case SO_TIMESTAMPNS_OLD:
1789 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1790 		break;
1791 
1792 	case SO_TIMESTAMP_NEW:
1793 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1794 		break;
1795 
1796 	case SO_TIMESTAMPNS_NEW:
1797 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1798 		break;
1799 
1800 	case SO_TIMESTAMPING_OLD:
1801 	case SO_TIMESTAMPING_NEW:
1802 		lv = sizeof(v.timestamping);
1803 		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1804 		 * returning the flags when they were set through the same option.
1805 		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1806 		 */
1807 		if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1808 			v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1809 			v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1810 		}
1811 		break;
1812 
1813 	case SO_RCVTIMEO_OLD:
1814 	case SO_RCVTIMEO_NEW:
1815 		lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1816 				      SO_RCVTIMEO_OLD == optname);
1817 		break;
1818 
1819 	case SO_SNDTIMEO_OLD:
1820 	case SO_SNDTIMEO_NEW:
1821 		lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1822 				      SO_SNDTIMEO_OLD == optname);
1823 		break;
1824 
1825 	case SO_RCVLOWAT:
1826 		v.val = READ_ONCE(sk->sk_rcvlowat);
1827 		break;
1828 
1829 	case SO_SNDLOWAT:
1830 		v.val = 1;
1831 		break;
1832 
1833 	case SO_PASSCRED:
1834 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1835 		break;
1836 
1837 	case SO_PASSPIDFD:
1838 		v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1839 		break;
1840 
1841 	case SO_PEERCRED:
1842 	{
1843 		struct ucred peercred;
1844 		if (len > sizeof(peercred))
1845 			len = sizeof(peercred);
1846 
1847 		spin_lock(&sk->sk_peer_lock);
1848 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1849 		spin_unlock(&sk->sk_peer_lock);
1850 
1851 		if (copy_to_sockptr(optval, &peercred, len))
1852 			return -EFAULT;
1853 		goto lenout;
1854 	}
1855 
1856 	case SO_PEERPIDFD:
1857 	{
1858 		struct pid *peer_pid;
1859 		struct file *pidfd_file = NULL;
1860 		int pidfd;
1861 
1862 		if (len > sizeof(pidfd))
1863 			len = sizeof(pidfd);
1864 
1865 		spin_lock(&sk->sk_peer_lock);
1866 		peer_pid = get_pid(sk->sk_peer_pid);
1867 		spin_unlock(&sk->sk_peer_lock);
1868 
1869 		if (!peer_pid)
1870 			return -ENODATA;
1871 
1872 		pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1873 		put_pid(peer_pid);
1874 		if (pidfd < 0)
1875 			return pidfd;
1876 
1877 		if (copy_to_sockptr(optval, &pidfd, len) ||
1878 		    copy_to_sockptr(optlen, &len, sizeof(int))) {
1879 			put_unused_fd(pidfd);
1880 			fput(pidfd_file);
1881 
1882 			return -EFAULT;
1883 		}
1884 
1885 		fd_install(pidfd, pidfd_file);
1886 		return 0;
1887 	}
1888 
1889 	case SO_PEERGROUPS:
1890 	{
1891 		const struct cred *cred;
1892 		int ret, n;
1893 
1894 		cred = sk_get_peer_cred(sk);
1895 		if (!cred)
1896 			return -ENODATA;
1897 
1898 		n = cred->group_info->ngroups;
1899 		if (len < n * sizeof(gid_t)) {
1900 			len = n * sizeof(gid_t);
1901 			put_cred(cred);
1902 			return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1903 		}
1904 		len = n * sizeof(gid_t);
1905 
1906 		ret = groups_to_user(optval, cred->group_info);
1907 		put_cred(cred);
1908 		if (ret)
1909 			return ret;
1910 		goto lenout;
1911 	}
1912 
1913 	case SO_PEERNAME:
1914 	{
1915 		struct sockaddr_storage address;
1916 
1917 		lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1918 		if (lv < 0)
1919 			return -ENOTCONN;
1920 		if (lv < len)
1921 			return -EINVAL;
1922 		if (copy_to_sockptr(optval, &address, len))
1923 			return -EFAULT;
1924 		goto lenout;
1925 	}
1926 
1927 	/* Dubious BSD thing... Probably nobody even uses it, but
1928 	 * the UNIX standard wants it for whatever reason... -DaveM
1929 	 */
1930 	case SO_ACCEPTCONN:
1931 		v.val = sk->sk_state == TCP_LISTEN;
1932 		break;
1933 
1934 	case SO_PASSSEC:
1935 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1936 		break;
1937 
1938 	case SO_PEERSEC:
1939 		return security_socket_getpeersec_stream(sock,
1940 							 optval, optlen, len);
1941 
1942 	case SO_MARK:
1943 		v.val = READ_ONCE(sk->sk_mark);
1944 		break;
1945 
1946 	case SO_RCVMARK:
1947 		v.val = sock_flag(sk, SOCK_RCVMARK);
1948 		break;
1949 
1950 	case SO_RXQ_OVFL:
1951 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1952 		break;
1953 
1954 	case SO_WIFI_STATUS:
1955 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1956 		break;
1957 
1958 	case SO_PEEK_OFF:
1959 		if (!READ_ONCE(sock->ops)->set_peek_off)
1960 			return -EOPNOTSUPP;
1961 
1962 		v.val = READ_ONCE(sk->sk_peek_off);
1963 		break;
1964 	case SO_NOFCS:
1965 		v.val = sock_flag(sk, SOCK_NOFCS);
1966 		break;
1967 
1968 	case SO_BINDTODEVICE:
1969 		return sock_getbindtodevice(sk, optval, optlen, len);
1970 
1971 	case SO_GET_FILTER:
1972 		len = sk_get_filter(sk, optval, len);
1973 		if (len < 0)
1974 			return len;
1975 
1976 		goto lenout;
1977 
1978 	case SO_LOCK_FILTER:
1979 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1980 		break;
1981 
1982 	case SO_BPF_EXTENSIONS:
1983 		v.val = bpf_tell_extensions();
1984 		break;
1985 
1986 	case SO_SELECT_ERR_QUEUE:
1987 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1988 		break;
1989 
1990 #ifdef CONFIG_NET_RX_BUSY_POLL
1991 	case SO_BUSY_POLL:
1992 		v.val = READ_ONCE(sk->sk_ll_usec);
1993 		break;
1994 	case SO_PREFER_BUSY_POLL:
1995 		v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1996 		break;
1997 #endif
1998 
1999 	case SO_MAX_PACING_RATE:
2000 		/* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2001 		if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2002 			lv = sizeof(v.ulval);
2003 			v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2004 		} else {
2005 			/* 32bit version */
2006 			v.val = min_t(unsigned long, ~0U,
2007 				      READ_ONCE(sk->sk_max_pacing_rate));
2008 		}
2009 		break;
2010 
2011 	case SO_INCOMING_CPU:
2012 		v.val = READ_ONCE(sk->sk_incoming_cpu);
2013 		break;
2014 
2015 	case SO_MEMINFO:
2016 	{
2017 		u32 meminfo[SK_MEMINFO_VARS];
2018 
2019 		sk_get_meminfo(sk, meminfo);
2020 
2021 		len = min_t(unsigned int, len, sizeof(meminfo));
2022 		if (copy_to_sockptr(optval, &meminfo, len))
2023 			return -EFAULT;
2024 
2025 		goto lenout;
2026 	}
2027 
2028 #ifdef CONFIG_NET_RX_BUSY_POLL
2029 	case SO_INCOMING_NAPI_ID:
2030 		v.val = READ_ONCE(sk->sk_napi_id);
2031 
2032 		/* aggregate non-NAPI IDs down to 0 */
2033 		if (v.val < MIN_NAPI_ID)
2034 			v.val = 0;
2035 
2036 		break;
2037 #endif
2038 
2039 	case SO_COOKIE:
2040 		lv = sizeof(u64);
2041 		if (len < lv)
2042 			return -EINVAL;
2043 		v.val64 = sock_gen_cookie(sk);
2044 		break;
2045 
2046 	case SO_ZEROCOPY:
2047 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
2048 		break;
2049 
2050 	case SO_TXTIME:
2051 		lv = sizeof(v.txtime);
2052 		v.txtime.clockid = sk->sk_clockid;
2053 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2054 				  SOF_TXTIME_DEADLINE_MODE : 0;
2055 		v.txtime.flags |= sk->sk_txtime_report_errors ?
2056 				  SOF_TXTIME_REPORT_ERRORS : 0;
2057 		break;
2058 
2059 	case SO_BINDTOIFINDEX:
2060 		v.val = READ_ONCE(sk->sk_bound_dev_if);
2061 		break;
2062 
2063 	case SO_NETNS_COOKIE:
2064 		lv = sizeof(u64);
2065 		if (len != lv)
2066 			return -EINVAL;
2067 		v.val64 = sock_net(sk)->net_cookie;
2068 		break;
2069 
2070 	case SO_BUF_LOCK:
2071 		v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2072 		break;
2073 
2074 	case SO_RESERVE_MEM:
2075 		v.val = READ_ONCE(sk->sk_reserved_mem);
2076 		break;
2077 
2078 	case SO_TXREHASH:
2079 		/* Paired with WRITE_ONCE() in sk_setsockopt() */
2080 		v.val = READ_ONCE(sk->sk_txrehash);
2081 		break;
2082 
2083 	default:
2084 		/* We implement the SO_SNDLOWAT etc to not be settable
2085 		 * (1003.1g 7).
2086 		 */
2087 		return -ENOPROTOOPT;
2088 	}
2089 
2090 	if (len > lv)
2091 		len = lv;
2092 	if (copy_to_sockptr(optval, &v, len))
2093 		return -EFAULT;
2094 lenout:
2095 	if (copy_to_sockptr(optlen, &len, sizeof(int)))
2096 		return -EFAULT;
2097 	return 0;
2098 }
2099 
2100 /*
2101  * Initialize an sk_lock.
2102  *
2103  * (We also register the sk_lock with the lock validator.)
2104  */
2105 static inline void sock_lock_init(struct sock *sk)
2106 {
2107 	if (sk->sk_kern_sock)
2108 		sock_lock_init_class_and_name(
2109 			sk,
2110 			af_family_kern_slock_key_strings[sk->sk_family],
2111 			af_family_kern_slock_keys + sk->sk_family,
2112 			af_family_kern_key_strings[sk->sk_family],
2113 			af_family_kern_keys + sk->sk_family);
2114 	else
2115 		sock_lock_init_class_and_name(
2116 			sk,
2117 			af_family_slock_key_strings[sk->sk_family],
2118 			af_family_slock_keys + sk->sk_family,
2119 			af_family_key_strings[sk->sk_family],
2120 			af_family_keys + sk->sk_family);
2121 }
2122 
2123 /*
2124  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2125  * even temporarily, because of RCU lookups. sk_node should also be left as is.
2126  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2127  */
2128 static void sock_copy(struct sock *nsk, const struct sock *osk)
2129 {
2130 	const struct proto *prot = READ_ONCE(osk->sk_prot);
2131 #ifdef CONFIG_SECURITY_NETWORK
2132 	void *sptr = nsk->sk_security;
2133 #endif
2134 
2135 	/* If we move sk_tx_queue_mapping out of the private section,
2136 	 * we must check if sk_tx_queue_clear() is called after
2137 	 * sock_copy() in sk_clone_lock().
2138 	 */
2139 	BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2140 		     offsetof(struct sock, sk_dontcopy_begin) ||
2141 		     offsetof(struct sock, sk_tx_queue_mapping) >=
2142 		     offsetof(struct sock, sk_dontcopy_end));
2143 
2144 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2145 
2146 	unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2147 		      prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2148 		      /* alloc is larger than struct, see sk_prot_alloc() */);
2149 
2150 #ifdef CONFIG_SECURITY_NETWORK
2151 	nsk->sk_security = sptr;
2152 	security_sk_clone(osk, nsk);
2153 #endif
2154 }
2155 
2156 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2157 		int family)
2158 {
2159 	struct sock *sk;
2160 	struct kmem_cache *slab;
2161 
2162 	slab = prot->slab;
2163 	if (slab != NULL) {
2164 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2165 		if (!sk)
2166 			return sk;
2167 		if (want_init_on_alloc(priority))
2168 			sk_prot_clear_nulls(sk, prot->obj_size);
2169 	} else
2170 		sk = kmalloc(prot->obj_size, priority);
2171 
2172 	if (sk != NULL) {
2173 		if (security_sk_alloc(sk, family, priority))
2174 			goto out_free;
2175 
2176 		if (!try_module_get(prot->owner))
2177 			goto out_free_sec;
2178 	}
2179 
2180 	return sk;
2181 
2182 out_free_sec:
2183 	security_sk_free(sk);
2184 out_free:
2185 	if (slab != NULL)
2186 		kmem_cache_free(slab, sk);
2187 	else
2188 		kfree(sk);
2189 	return NULL;
2190 }
2191 
2192 static void sk_prot_free(struct proto *prot, struct sock *sk)
2193 {
2194 	struct kmem_cache *slab;
2195 	struct module *owner;
2196 
2197 	owner = prot->owner;
2198 	slab = prot->slab;
2199 
2200 	cgroup_sk_free(&sk->sk_cgrp_data);
2201 	mem_cgroup_sk_free(sk);
2202 	security_sk_free(sk);
2203 	if (slab != NULL)
2204 		kmem_cache_free(slab, sk);
2205 	else
2206 		kfree(sk);
2207 	module_put(owner);
2208 }
2209 
2210 /**
2211  *	sk_alloc - All socket objects are allocated here
2212  *	@net: the applicable net namespace
2213  *	@family: protocol family
2214  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2215  *	@prot: struct proto associated with this new sock instance
2216  *	@kern: is this to be a kernel socket?
2217  */
2218 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2219 		      struct proto *prot, int kern)
2220 {
2221 	struct sock *sk;
2222 
2223 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2224 	if (sk) {
2225 		sk->sk_family = family;
2226 		/*
2227 		 * See comment in struct sock definition to understand
2228 		 * why we need sk_prot_creator -acme
2229 		 */
2230 		sk->sk_prot = sk->sk_prot_creator = prot;
2231 		sk->sk_kern_sock = kern;
2232 		sock_lock_init(sk);
2233 		sk->sk_net_refcnt = kern ? 0 : 1;
2234 		if (likely(sk->sk_net_refcnt)) {
2235 			get_net_track(net, &sk->ns_tracker, priority);
2236 			sock_inuse_add(net, 1);
2237 		} else {
2238 			__netns_tracker_alloc(net, &sk->ns_tracker,
2239 					      false, priority);
2240 		}
2241 
2242 		sock_net_set(sk, net);
2243 		refcount_set(&sk->sk_wmem_alloc, 1);
2244 
2245 		mem_cgroup_sk_alloc(sk);
2246 		cgroup_sk_alloc(&sk->sk_cgrp_data);
2247 		sock_update_classid(&sk->sk_cgrp_data);
2248 		sock_update_netprioidx(&sk->sk_cgrp_data);
2249 		sk_tx_queue_clear(sk);
2250 	}
2251 
2252 	return sk;
2253 }
2254 EXPORT_SYMBOL(sk_alloc);
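
/* Illustrative sketch, not part of the original file: a protocol family's
 * create() handler typically pairs sk_alloc() with sock_init_data() and,
 * on error paths, sk_free().  "my_proto" is a hypothetical struct proto:
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 *	sk_free(sk);	(error path, while sk_wmem_alloc still holds the
 *			 initial reference set above)
 */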
2255 
2256 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2257  * grace period. This is the case for UDP sockets and TCP listeners.
2258  */
2259 static void __sk_destruct(struct rcu_head *head)
2260 {
2261 	struct sock *sk = container_of(head, struct sock, sk_rcu);
2262 	struct sk_filter *filter;
2263 
2264 	if (sk->sk_destruct)
2265 		sk->sk_destruct(sk);
2266 
2267 	filter = rcu_dereference_check(sk->sk_filter,
2268 				       refcount_read(&sk->sk_wmem_alloc) == 0);
2269 	if (filter) {
2270 		sk_filter_uncharge(sk, filter);
2271 		RCU_INIT_POINTER(sk->sk_filter, NULL);
2272 	}
2273 
2274 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2275 
2276 #ifdef CONFIG_BPF_SYSCALL
2277 	bpf_sk_storage_free(sk);
2278 #endif
2279 
2280 	if (atomic_read(&sk->sk_omem_alloc))
2281 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
2282 			 __func__, atomic_read(&sk->sk_omem_alloc));
2283 
2284 	if (sk->sk_frag.page) {
2285 		put_page(sk->sk_frag.page);
2286 		sk->sk_frag.page = NULL;
2287 	}
2288 
2289 	/* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2290 	put_cred(sk->sk_peer_cred);
2291 	put_pid(sk->sk_peer_pid);
2292 
2293 	if (likely(sk->sk_net_refcnt))
2294 		put_net_track(sock_net(sk), &sk->ns_tracker);
2295 	else
2296 		__netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2297 
2298 	sk_prot_free(sk->sk_prot_creator, sk);
2299 }
2300 
2301 void sk_destruct(struct sock *sk)
2302 {
2303 	bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2304 
2305 	if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2306 		reuseport_detach_sock(sk);
2307 		use_call_rcu = true;
2308 	}
2309 
2310 	if (use_call_rcu)
2311 		call_rcu(&sk->sk_rcu, __sk_destruct);
2312 	else
2313 		__sk_destruct(&sk->sk_rcu);
2314 }
2315 
2316 static void __sk_free(struct sock *sk)
2317 {
2318 	if (likely(sk->sk_net_refcnt))
2319 		sock_inuse_add(sock_net(sk), -1);
2320 
2321 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2322 		sock_diag_broadcast_destroy(sk);
2323 	else
2324 		sk_destruct(sk);
2325 }
2326 
2327 void sk_free(struct sock *sk)
2328 {
2329 	/*
2330 	 * We subtract one from sk_wmem_alloc and can know if
2331 	 * some packets are still in some tx queue.
2332 	 * If not null, sock_wfree() will call __sk_free(sk) later
2333 	 */
2334 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2335 		__sk_free(sk);
2336 }
2337 EXPORT_SYMBOL(sk_free);
2338 
2339 static void sk_init_common(struct sock *sk)
2340 {
2341 	skb_queue_head_init(&sk->sk_receive_queue);
2342 	skb_queue_head_init(&sk->sk_write_queue);
2343 	skb_queue_head_init(&sk->sk_error_queue);
2344 
2345 	rwlock_init(&sk->sk_callback_lock);
2346 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2347 			af_rlock_keys + sk->sk_family,
2348 			af_family_rlock_key_strings[sk->sk_family]);
2349 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2350 			af_wlock_keys + sk->sk_family,
2351 			af_family_wlock_key_strings[sk->sk_family]);
2352 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2353 			af_elock_keys + sk->sk_family,
2354 			af_family_elock_key_strings[sk->sk_family]);
2355 	if (sk->sk_kern_sock)
2356 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2357 			af_kern_callback_keys + sk->sk_family,
2358 			af_family_kern_clock_key_strings[sk->sk_family]);
2359 	else
2360 		lockdep_set_class_and_name(&sk->sk_callback_lock,
2361 			af_callback_keys + sk->sk_family,
2362 			af_family_clock_key_strings[sk->sk_family]);
2363 }
2364 
2365 /**
2366  *	sk_clone_lock - clone a socket, and lock its clone
2367  *	@sk: the socket to clone
2368  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2369  *
2370  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2371  */
2372 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2373 {
2374 	struct proto *prot = READ_ONCE(sk->sk_prot);
2375 	struct sk_filter *filter;
2376 	bool is_charged = true;
2377 	struct sock *newsk;
2378 
2379 	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2380 	if (!newsk)
2381 		goto out;
2382 
2383 	sock_copy(newsk, sk);
2384 
2385 	newsk->sk_prot_creator = prot;
2386 
2387 	/* SANITY */
2388 	if (likely(newsk->sk_net_refcnt)) {
2389 		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2390 		sock_inuse_add(sock_net(newsk), 1);
2391 	} else {
2392 		/* Kernel sockets do not elevate the struct net refcount.
2393 		 * Instead, use a tracker to more easily detect if a layer
2394 		 * is not properly dismantling its kernel sockets at netns
2395 		 * destroy time.
2396 		 */
2397 		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2398 				      false, priority);
2399 	}
2400 	sk_node_init(&newsk->sk_node);
2401 	sock_lock_init(newsk);
2402 	bh_lock_sock(newsk);
2403 	newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
2404 	newsk->sk_backlog.len = 0;
2405 
2406 	atomic_set(&newsk->sk_rmem_alloc, 0);
2407 
2408 	/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2409 	refcount_set(&newsk->sk_wmem_alloc, 1);
2410 
2411 	atomic_set(&newsk->sk_omem_alloc, 0);
2412 	sk_init_common(newsk);
2413 
2414 	newsk->sk_dst_cache	= NULL;
2415 	newsk->sk_dst_pending_confirm = 0;
2416 	newsk->sk_wmem_queued	= 0;
2417 	newsk->sk_forward_alloc = 0;
2418 	newsk->sk_reserved_mem  = 0;
2419 	atomic_set(&newsk->sk_drops, 0);
2420 	newsk->sk_send_head	= NULL;
2421 	newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2422 	atomic_set(&newsk->sk_zckey, 0);
2423 
2424 	sock_reset_flag(newsk, SOCK_DONE);
2425 
2426 	/* sk->sk_memcg will be populated at accept() time */
2427 	newsk->sk_memcg = NULL;
2428 
2429 	cgroup_sk_clone(&newsk->sk_cgrp_data);
2430 
2431 	rcu_read_lock();
2432 	filter = rcu_dereference(sk->sk_filter);
2433 	if (filter != NULL)
2434 		/* though it's an empty new sock, the charging may fail
2435 		 * if sysctl_optmem_max was changed between creation of
2436 		 * if sysctl_optmem_max was changed between creation of the
2437 		 * original socket and this clone
2438 		is_charged = sk_filter_charge(newsk, filter);
2439 	RCU_INIT_POINTER(newsk->sk_filter, filter);
2440 	rcu_read_unlock();
2441 
2442 	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2443 		/* We need to make sure that we don't uncharge the new
2444 		 * socket if we couldn't charge it in the first place
2445 		 * as otherwise we uncharge the parent's filter.
2446 		 */
2447 		if (!is_charged)
2448 			RCU_INIT_POINTER(newsk->sk_filter, NULL);
2449 		sk_free_unlock_clone(newsk);
2450 		newsk = NULL;
2451 		goto out;
2452 	}
2453 	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2454 
2455 	if (bpf_sk_storage_clone(sk, newsk)) {
2456 		sk_free_unlock_clone(newsk);
2457 		newsk = NULL;
2458 		goto out;
2459 	}
2460 
2461 	/* Clear sk_user_data if parent had the pointer tagged
2462 	 * as not suitable for copying when cloning.
2463 	 */
2464 	if (sk_user_data_is_nocopy(newsk))
2465 		newsk->sk_user_data = NULL;
2466 
2467 	newsk->sk_err	   = 0;
2468 	newsk->sk_err_soft = 0;
2469 	newsk->sk_priority = 0;
2470 	newsk->sk_incoming_cpu = raw_smp_processor_id();
2471 
2472 	/* Before updating sk_refcnt, we must commit prior changes to memory
2473 	 * (Documentation/RCU/rculist_nulls.rst for details)
2474 	 */
2475 	smp_wmb();
2476 	refcount_set(&newsk->sk_refcnt, 2);
2477 
2478 	sk_set_socket(newsk, NULL);
2479 	sk_tx_queue_clear(newsk);
2480 	RCU_INIT_POINTER(newsk->sk_wq, NULL);
2481 
2482 	if (newsk->sk_prot->sockets_allocated)
2483 		sk_sockets_allocated_inc(newsk);
2484 
2485 	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2486 		net_enable_timestamp();
2487 out:
2488 	return newsk;
2489 }
2490 EXPORT_SYMBOL_GPL(sk_clone_lock);
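
/* Illustrative sketch (hypothetical caller, not from this file): the clone
 * comes back bh-locked, so even the success path must unlock it:
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol-private initialisation of newsk ...
 *		bh_unlock_sock(newsk);
 *	}
 */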
2491 
2492 void sk_free_unlock_clone(struct sock *sk)
2493 {
2494 	/* It is still a raw copy of the parent, so invalidate
2495 	 * the destructor and do a plain sk_free() */
2496 	sk->sk_destruct = NULL;
2497 	bh_unlock_sock(sk);
2498 	sk_free(sk);
2499 }
2500 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2501 
2502 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2503 {
2504 	bool is_ipv6 = false;
2505 	u32 max_size;
2506 
2507 #if IS_ENABLED(CONFIG_IPV6)
2508 	is_ipv6 = (sk->sk_family == AF_INET6 &&
2509 		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2510 #endif
2511 	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2512 	max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2513 			READ_ONCE(dst->dev->gso_ipv4_max_size);
2514 	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2515 		max_size = GSO_LEGACY_MAX_SIZE;
2516 
2517 	return max_size - (MAX_TCP_HEADER + 1);
2518 }
2519 
2520 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2521 {
2522 	u32 max_segs = 1;
2523 
2524 	sk->sk_route_caps = dst->dev->features;
2525 	if (sk_is_tcp(sk))
2526 		sk->sk_route_caps |= NETIF_F_GSO;
2527 	if (sk->sk_route_caps & NETIF_F_GSO)
2528 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2529 	if (unlikely(sk->sk_gso_disabled))
2530 		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2531 	if (sk_can_gso(sk)) {
2532 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2533 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2534 		} else {
2535 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2536 			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2537 			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2538 			max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2539 		}
2540 	}
2541 	sk->sk_gso_max_segs = max_segs;
2542 	sk_dst_set(sk, dst);
2543 }
2544 EXPORT_SYMBOL_GPL(sk_setup_caps);
2545 
2546 /*
2547  *	Simple resource managers for sockets.
2548  */
2549 
2550 
2551 /*
2552  * Write buffer destructor automatically called from kfree_skb.
2553  */
2554 void sock_wfree(struct sk_buff *skb)
2555 {
2556 	struct sock *sk = skb->sk;
2557 	unsigned int len = skb->truesize;
2558 	bool free;
2559 
2560 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2561 		if (sock_flag(sk, SOCK_RCU_FREE) &&
2562 		    sk->sk_write_space == sock_def_write_space) {
2563 			rcu_read_lock();
2564 			free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2565 			sock_def_write_space_wfree(sk);
2566 			rcu_read_unlock();
2567 			if (unlikely(free))
2568 				__sk_free(sk);
2569 			return;
2570 		}
2571 
2572 		/*
2573 		 * Keep a reference on sk_wmem_alloc; it will be released
2574 		 * after the sk_write_space() call
2575 		 */
2576 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2577 		sk->sk_write_space(sk);
2578 		len = 1;
2579 	}
2580 	/*
2581 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2582 	 * could not do because of in-flight packets
2583 	 */
2584 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2585 		__sk_free(sk);
2586 }
2587 EXPORT_SYMBOL(sock_wfree);
2588 
2589 /* This variant of sock_wfree() is used by TCP,
2590  * since it sets SOCK_USE_WRITE_QUEUE.
2591  */
2592 void __sock_wfree(struct sk_buff *skb)
2593 {
2594 	struct sock *sk = skb->sk;
2595 
2596 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2597 		__sk_free(sk);
2598 }
2599 
2600 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2601 {
2602 	skb_orphan(skb);
2603 	skb->sk = sk;
2604 #ifdef CONFIG_INET
2605 	if (unlikely(!sk_fullsock(sk))) {
2606 		skb->destructor = sock_edemux;
2607 		sock_hold(sk);
2608 		return;
2609 	}
2610 #endif
2611 	skb->destructor = sock_wfree;
2612 	skb_set_hash_from_sk(skb, sk);
2613 	/*
2614 	 * We used to take a refcount on sk, but the following operation
2615 	 * is enough to guarantee sk_free() won't free this sock until
2616 	 * all in-flight packets are completed
2617 	 */
2618 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2619 }
2620 EXPORT_SYMBOL(skb_set_owner_w);
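
/* Illustrative sketch (assumed caller, not from this file): charge a freshly
 * allocated skb to a socket's write budget; when kfree_skb() runs, the
 * sock_wfree() destructor set here releases the charge again:
 *
 *	skb = alloc_skb(size, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *	...
 *	kfree_skb(skb);
 */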
2621 
2622 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2623 {
2624 	/* Drivers depend on in-order delivery for crypto offload,
2625 	 * partial orphan breaks out-of-order-OK logic.
2626 	 */
2627 	if (skb_is_decrypted(skb))
2628 		return false;
2629 
2630 	return (skb->destructor == sock_wfree ||
2631 		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2632 }
2633 
2634 /* This helper is used by netem, as it can hold packets in its
2635  * delay queue. We want to allow the owner socket to send more
2636  * packets, as if they were already TX completed by a typical driver.
2637  * But we also want to keep skb->sk set because some packet schedulers
2638  * rely on it (sch_fq for example).
2639  */
2640 void skb_orphan_partial(struct sk_buff *skb)
2641 {
2642 	if (skb_is_tcp_pure_ack(skb))
2643 		return;
2644 
2645 	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2646 		return;
2647 
2648 	skb_orphan(skb);
2649 }
2650 EXPORT_SYMBOL(skb_orphan_partial);
2651 
2652 /*
2653  * Read buffer destructor automatically called from kfree_skb.
2654  */
2655 void sock_rfree(struct sk_buff *skb)
2656 {
2657 	struct sock *sk = skb->sk;
2658 	unsigned int len = skb->truesize;
2659 
2660 	atomic_sub(len, &sk->sk_rmem_alloc);
2661 	sk_mem_uncharge(sk, len);
2662 }
2663 EXPORT_SYMBOL(sock_rfree);
2664 
2665 /*
2666  * Buffer destructor for skbs that are not used directly in read or write
2667  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2668  */
2669 void sock_efree(struct sk_buff *skb)
2670 {
2671 	sock_put(skb->sk);
2672 }
2673 EXPORT_SYMBOL(sock_efree);
2674 
2675 /* Buffer destructor for prefetch/receive path where reference count may
2676  * not be held, e.g. for listen sockets.
2677  */
2678 #ifdef CONFIG_INET
2679 void sock_pfree(struct sk_buff *skb)
2680 {
2681 	struct sock *sk = skb->sk;
2682 
2683 	if (!sk_is_refcounted(sk))
2684 		return;
2685 
2686 	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2687 		inet_reqsk(sk)->rsk_listener = NULL;
2688 		reqsk_free(inet_reqsk(sk));
2689 		return;
2690 	}
2691 
2692 	sock_gen_put(sk);
2693 }
2694 EXPORT_SYMBOL(sock_pfree);
2695 #endif /* CONFIG_INET */
2696 
2697 kuid_t sock_i_uid(struct sock *sk)
2698 {
2699 	kuid_t uid;
2700 
2701 	read_lock_bh(&sk->sk_callback_lock);
2702 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2703 	read_unlock_bh(&sk->sk_callback_lock);
2704 	return uid;
2705 }
2706 EXPORT_SYMBOL(sock_i_uid);
2707 
2708 unsigned long __sock_i_ino(struct sock *sk)
2709 {
2710 	unsigned long ino;
2711 
2712 	read_lock(&sk->sk_callback_lock);
2713 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2714 	read_unlock(&sk->sk_callback_lock);
2715 	return ino;
2716 }
2717 EXPORT_SYMBOL(__sock_i_ino);
2718 
2719 unsigned long sock_i_ino(struct sock *sk)
2720 {
2721 	unsigned long ino;
2722 
2723 	local_bh_disable();
2724 	ino = __sock_i_ino(sk);
2725 	local_bh_enable();
2726 	return ino;
2727 }
2728 EXPORT_SYMBOL(sock_i_ino);
2729 
2730 /*
2731  * Allocate a skb from the socket's send buffer.
2732  */
2733 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2734 			     gfp_t priority)
2735 {
2736 	if (force ||
2737 	    refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2738 		struct sk_buff *skb = alloc_skb(size, priority);
2739 
2740 		if (skb) {
2741 			skb_set_owner_w(skb, sk);
2742 			return skb;
2743 		}
2744 	}
2745 	return NULL;
2746 }
2747 EXPORT_SYMBOL(sock_wmalloc);
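
/* Illustrative sketch (hypothetical use, not from this file): a control
 * packet that must go out even when the send buffer is full can pass
 * force == 1, at the cost of overshooting sk_sndbuf:
 *
 *	skb = sock_wmalloc(sk, len, 1, GFP_ATOMIC);
 *	if (!skb)
 *		return -ENOBUFS;
 */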
2748 
2749 static void sock_ofree(struct sk_buff *skb)
2750 {
2751 	struct sock *sk = skb->sk;
2752 
2753 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2754 }
2755 
2756 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2757 			     gfp_t priority)
2758 {
2759 	struct sk_buff *skb;
2760 
2761 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2762 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2763 	    READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2764 		return NULL;
2765 
2766 	skb = alloc_skb(size, priority);
2767 	if (!skb)
2768 		return NULL;
2769 
2770 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2771 	skb->sk = sk;
2772 	skb->destructor = sock_ofree;
2773 	return skb;
2774 }
2775 
2776 /*
2777  * Allocate a memory block from the socket's option memory buffer.
2778  */
2779 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2780 {
2781 	int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2782 
2783 	if ((unsigned int)size <= optmem_max &&
2784 	    atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2785 		void *mem;
2786 		/* First do the add, to avoid the race if kmalloc
2787 		 * might sleep.
2788 		 */
2789 		atomic_add(size, &sk->sk_omem_alloc);
2790 		mem = kmalloc(size, priority);
2791 		if (mem)
2792 			return mem;
2793 		atomic_sub(size, &sk->sk_omem_alloc);
2794 	}
2795 	return NULL;
2796 }
2797 EXPORT_SYMBOL(sock_kmalloc);
2798 
2799 /* Free an option memory block. Note, we actually want the inline
2800  * here as this allows gcc to detect the nullify and fold away the
2801  * condition entirely.
2802  */
2803 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2804 				  const bool nullify)
2805 {
2806 	if (WARN_ON_ONCE(!mem))
2807 		return;
2808 	if (nullify)
2809 		kfree_sensitive(mem);
2810 	else
2811 		kfree(mem);
2812 	atomic_sub(size, &sk->sk_omem_alloc);
2813 }
2814 
2815 void sock_kfree_s(struct sock *sk, void *mem, int size)
2816 {
2817 	__sock_kfree_s(sk, mem, size, false);
2818 }
2819 EXPORT_SYMBOL(sock_kfree_s);
2820 
2821 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2822 {
2823 	__sock_kfree_s(sk, mem, size, true);
2824 }
2825 EXPORT_SYMBOL(sock_kzfree_s);
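
/* Illustrative sketch (hypothetical option structure): option memory is
 * charged to sk_omem_alloc, so the size passed to the free helper must match
 * the allocation; sock_kzfree_s() additionally wipes sensitive contents:
 *
 *	struct my_opt *opt;
 *
 *	opt = sock_kmalloc(sk, sizeof(*opt), GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, sizeof(*opt));
 */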
2826 
2827 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2828    I think these locks should be removed for datagram sockets.
2829  */
2830 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2831 {
2832 	DEFINE_WAIT(wait);
2833 
2834 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2835 	for (;;) {
2836 		if (!timeo)
2837 			break;
2838 		if (signal_pending(current))
2839 			break;
2840 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2841 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2842 		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2843 			break;
2844 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2845 			break;
2846 		if (READ_ONCE(sk->sk_err))
2847 			break;
2848 		timeo = schedule_timeout(timeo);
2849 	}
2850 	finish_wait(sk_sleep(sk), &wait);
2851 	return timeo;
2852 }
2853 
2854 
2855 /*
2856  *	Generic send/receive buffer handlers
2857  */
2858 
2859 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2860 				     unsigned long data_len, int noblock,
2861 				     int *errcode, int max_page_order)
2862 {
2863 	struct sk_buff *skb;
2864 	long timeo;
2865 	int err;
2866 
2867 	timeo = sock_sndtimeo(sk, noblock);
2868 	for (;;) {
2869 		err = sock_error(sk);
2870 		if (err != 0)
2871 			goto failure;
2872 
2873 		err = -EPIPE;
2874 		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2875 			goto failure;
2876 
2877 		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2878 			break;
2879 
2880 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2881 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2882 		err = -EAGAIN;
2883 		if (!timeo)
2884 			goto failure;
2885 		if (signal_pending(current))
2886 			goto interrupted;
2887 		timeo = sock_wait_for_wmem(sk, timeo);
2888 	}
2889 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2890 				   errcode, sk->sk_allocation);
2891 	if (skb)
2892 		skb_set_owner_w(skb, sk);
2893 	return skb;
2894 
2895 interrupted:
2896 	err = sock_intr_errno(timeo);
2897 failure:
2898 	*errcode = err;
2899 	return NULL;
2900 }
2901 EXPORT_SYMBOL(sock_alloc_send_pskb);
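
/* Illustrative sketch (assumed datagram sendmsg path, not from this file):
 * passing data_len == 0 and max_page_order == 0 requests a linear skb; the
 * helper blocks according to the send timeout unless noblock is set:
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT, &err, 0);
 *	if (!skb)
 *		goto out_err;
 */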
2902 
2903 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
2904 		     struct sockcm_cookie *sockc)
2905 {
2906 	u32 tsflags;
2907 
2908 	switch (cmsg->cmsg_type) {
2909 	case SO_MARK:
2910 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
2911 		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2912 			return -EPERM;
2913 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2914 			return -EINVAL;
2915 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2916 		break;
2917 	case SO_TIMESTAMPING_OLD:
2918 	case SO_TIMESTAMPING_NEW:
2919 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2920 			return -EINVAL;
2921 
2922 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2923 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2924 			return -EINVAL;
2925 
2926 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2927 		sockc->tsflags |= tsflags;
2928 		break;
2929 	case SCM_TXTIME:
2930 		if (!sock_flag(sk, SOCK_TXTIME))
2931 			return -EINVAL;
2932 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2933 			return -EINVAL;
2934 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2935 		break;
2936 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2937 	case SCM_RIGHTS:
2938 	case SCM_CREDENTIALS:
2939 		break;
2940 	default:
2941 		return -EINVAL;
2942 	}
2943 	return 0;
2944 }
2945 EXPORT_SYMBOL(__sock_cmsg_send);
2946 
2947 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2948 		   struct sockcm_cookie *sockc)
2949 {
2950 	struct cmsghdr *cmsg;
2951 	int ret;
2952 
2953 	for_each_cmsghdr(cmsg, msg) {
2954 		if (!CMSG_OK(msg, cmsg))
2955 			return -EINVAL;
2956 		if (cmsg->cmsg_level != SOL_SOCKET)
2957 			continue;
2958 		ret = __sock_cmsg_send(sk, cmsg, sockc);
2959 		if (ret)
2960 			return ret;
2961 	}
2962 	return 0;
2963 }
2964 EXPORT_SYMBOL(sock_cmsg_send);
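
/* Illustrative sketch (assumed sendmsg path): the cookie is seeded from the
 * socket defaults and then overridden by any SOL_SOCKET control messages:
 *
 *	struct sockcm_cookie sockc = { .tsflags = READ_ONCE(sk->sk_tsflags) };
 *
 *	if (msg->msg_controllen) {
 *		err = sock_cmsg_send(sk, msg, &sockc);
 *		if (unlikely(err))
 *			return err;
 *	}
 */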
2965 
2966 static void sk_enter_memory_pressure(struct sock *sk)
2967 {
2968 	if (!sk->sk_prot->enter_memory_pressure)
2969 		return;
2970 
2971 	sk->sk_prot->enter_memory_pressure(sk);
2972 }
2973 
2974 static void sk_leave_memory_pressure(struct sock *sk)
2975 {
2976 	if (sk->sk_prot->leave_memory_pressure) {
2977 		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2978 				     tcp_leave_memory_pressure, sk);
2979 	} else {
2980 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2981 
2982 		if (memory_pressure && READ_ONCE(*memory_pressure))
2983 			WRITE_ONCE(*memory_pressure, 0);
2984 	}
2985 }
2986 
2987 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2988 
2989 /**
2990  * skb_page_frag_refill - check that a page_frag contains enough room
2991  * @sz: minimum size of the fragment we want to get
2992  * @pfrag: pointer to page_frag
2993  * @gfp: priority for memory allocation
2994  *
2995  * Note: While this allocator tries to use high order pages, there is
2996  * no guarantee that allocations succeed. Therefore, @sz MUST be
2997  * less than or equal to PAGE_SIZE.
2998  */
2999 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3000 {
3001 	if (pfrag->page) {
3002 		if (page_ref_count(pfrag->page) == 1) {
3003 			pfrag->offset = 0;
3004 			return true;
3005 		}
3006 		if (pfrag->offset + sz <= pfrag->size)
3007 			return true;
3008 		put_page(pfrag->page);
3009 	}
3010 
3011 	pfrag->offset = 0;
3012 	if (SKB_FRAG_PAGE_ORDER &&
3013 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3014 		/* Avoid direct reclaim but allow kswapd to wake */
3015 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3016 					  __GFP_COMP | __GFP_NOWARN |
3017 					  __GFP_NORETRY,
3018 					  SKB_FRAG_PAGE_ORDER);
3019 		if (likely(pfrag->page)) {
3020 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3021 			return true;
3022 		}
3023 	}
3024 	pfrag->page = alloc_page(gfp);
3025 	if (likely(pfrag->page)) {
3026 		pfrag->size = PAGE_SIZE;
3027 		return true;
3028 	}
3029 	return false;
3030 }
3031 EXPORT_SYMBOL(skb_page_frag_refill);
3032 
3033 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3034 {
3035 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3036 		return true;
3037 
3038 	sk_enter_memory_pressure(sk);
3039 	sk_stream_moderate_sndbuf(sk);
3040 	return false;
3041 }
3042 EXPORT_SYMBOL(sk_page_frag_refill);
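
/* Illustrative sketch (assumed sendmsg copy path): make sure the per-socket
 * page_frag has room before copying the next chunk of user data into it:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, msg_data_left(msg), pfrag->size - pfrag->offset);
 */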
3043 
3044 void __lock_sock(struct sock *sk)
3045 	__releases(&sk->sk_lock.slock)
3046 	__acquires(&sk->sk_lock.slock)
3047 {
3048 	DEFINE_WAIT(wait);
3049 
3050 	for (;;) {
3051 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3052 					TASK_UNINTERRUPTIBLE);
3053 		spin_unlock_bh(&sk->sk_lock.slock);
3054 		schedule();
3055 		spin_lock_bh(&sk->sk_lock.slock);
3056 		if (!sock_owned_by_user(sk))
3057 			break;
3058 	}
3059 	finish_wait(&sk->sk_lock.wq, &wait);
3060 }
3061 
3062 void __release_sock(struct sock *sk)
3063 	__releases(&sk->sk_lock.slock)
3064 	__acquires(&sk->sk_lock.slock)
3065 {
3066 	struct sk_buff *skb, *next;
3067 
3068 	while ((skb = sk->sk_backlog.head) != NULL) {
3069 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3070 
3071 		spin_unlock_bh(&sk->sk_lock.slock);
3072 
3073 		do {
3074 			next = skb->next;
3075 			prefetch(next);
3076 			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3077 			skb_mark_not_on_list(skb);
3078 			sk_backlog_rcv(sk, skb);
3079 
3080 			cond_resched();
3081 
3082 			skb = next;
3083 		} while (skb != NULL);
3084 
3085 		spin_lock_bh(&sk->sk_lock.slock);
3086 	}
3087 
3088 	/*
3089 	 * Doing the zeroing here guarantees we cannot loop forever
3090 	 * while a wild producer attempts to flood us.
3091 	 */
3092 	sk->sk_backlog.len = 0;
3093 }
3094 
3095 void __sk_flush_backlog(struct sock *sk)
3096 {
3097 	spin_lock_bh(&sk->sk_lock.slock);
3098 	__release_sock(sk);
3099 
3100 	if (sk->sk_prot->release_cb)
3101 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3102 				     tcp_release_cb, sk);
3103 
3104 	spin_unlock_bh(&sk->sk_lock.slock);
3105 }
3106 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3107 
3108 /**
3109  * sk_wait_data - wait for data to arrive at sk_receive_queue
3110  * @sk:    sock to wait on
3111  * @timeo: for how long
3112  * @skb:   last skb seen on sk_receive_queue
3113  *
3114  * Now socket state including sk->sk_err is changed only under lock,
3115  * hence we may omit checks after joining the wait queue.
3116  * We check the receive queue before schedule() only as an optimization;
3117  * it is very likely that release_sock() added new data.
3118  */
3119 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3120 {
3121 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
3122 	int rc;
3123 
3124 	add_wait_queue(sk_sleep(sk), &wait);
3125 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3126 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3127 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3128 	remove_wait_queue(sk_sleep(sk), &wait);
3129 	return rc;
3130 }
3131 EXPORT_SYMBOL(sk_wait_data);
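
/* Illustrative sketch (assumed recvmsg slow path, socket lock held): wait
 * for the receive queue to change while honouring the receive timeout:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */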
3132 
3133 /**
3134  *	__sk_mem_raise_allocated - increase memory_allocated
3135  *	@sk: socket
3136  *	@size: memory size to allocate
3137  *	@amt: pages to allocate
3138  *	@kind: allocation type
3139  *
3140  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3141  *
3142  *	Unlike the globally shared limits among the sockets under the same protocol,
3143  *	consuming the budget of a memcg won't have a direct effect on other ones.
3144  *	So be optimistic about memcg's tolerance, and leave the callers to decide
3145  *	whether or not to raise allocated through sk_under_memory_pressure() or
3146  *	its variants.
3147  */
3148 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3149 {
3150 	struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3151 	struct proto *prot = sk->sk_prot;
3152 	bool charged = false;
3153 	long allocated;
3154 
3155 	sk_memory_allocated_add(sk, amt);
3156 	allocated = sk_memory_allocated(sk);
3157 
3158 	if (memcg) {
3159 		if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
3160 			goto suppress_allocation;
3161 		charged = true;
3162 	}
3163 
3164 	/* Under limit. */
3165 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
3166 		sk_leave_memory_pressure(sk);
3167 		return 1;
3168 	}
3169 
3170 	/* Under pressure. */
3171 	if (allocated > sk_prot_mem_limits(sk, 1))
3172 		sk_enter_memory_pressure(sk);
3173 
3174 	/* Over hard limit. */
3175 	if (allocated > sk_prot_mem_limits(sk, 2))
3176 		goto suppress_allocation;
3177 
3178 	/* Guarantee minimum buffer size under pressure (either global
3179 	 * or memcg) to make sure features described in RFC 7323 (TCP
3180 	 * Extensions for High Performance) work properly.
3181 	 *
3182 	 * This rule does NOT stand when usage exceeds the global or memcg
3183 	 * hard limit, or else a DoS attack could be mounted by spawning
3184 	 * lots of sockets whose usage is under the minimum buffer size.
3185 	 */
3186 	if (kind == SK_MEM_RECV) {
3187 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3188 			return 1;
3189 
3190 	} else { /* SK_MEM_SEND */
3191 		int wmem0 = sk_get_wmem0(sk, prot);
3192 
3193 		if (sk->sk_type == SOCK_STREAM) {
3194 			if (sk->sk_wmem_queued < wmem0)
3195 				return 1;
3196 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3197 				return 1;
3198 		}
3199 	}
3200 
3201 	if (sk_has_memory_pressure(sk)) {
3202 		u64 alloc;
3203 
3204 		/* The following 'average' heuristic is within the
3205 		 * scope of global accounting, so it only makes
3206 		 * sense for global memory pressure.
3207 		 */
3208 		if (!sk_under_global_memory_pressure(sk))
3209 			return 1;
3210 
3211 		/* Try to be fair among all the sockets under global
3212 		 * pressure by allowing the ones whose usage is below
3213 		 * average to raise.
3214 		 */
3215 		alloc = sk_sockets_allocated_read_positive(sk);
3216 		if (sk_prot_mem_limits(sk, 2) > alloc *
3217 		    sk_mem_pages(sk->sk_wmem_queued +
3218 				 atomic_read(&sk->sk_rmem_alloc) +
3219 				 sk->sk_forward_alloc))
3220 			return 1;
3221 	}
3222 
3223 suppress_allocation:
3224 
3225 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3226 		sk_stream_moderate_sndbuf(sk);
3227 
3228 		/* Fail only if socket is _under_ its sndbuf.
3229 		 * In this case we cannot block, so we have to fail.
3230 		 */
3231 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3232 			/* Force charge with __GFP_NOFAIL */
3233 			if (memcg && !charged) {
3234 				mem_cgroup_charge_skmem(memcg, amt,
3235 					gfp_memcg_charge() | __GFP_NOFAIL);
3236 			}
3237 			return 1;
3238 		}
3239 	}
3240 
3241 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3242 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3243 
3244 	sk_memory_allocated_sub(sk, amt);
3245 
3246 	if (charged)
3247 		mem_cgroup_uncharge_skmem(memcg, amt);
3248 
3249 	return 0;
3250 }
3251 
3252 /**
3253  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3254  *	@sk: socket
3255  *	@size: memory size to allocate
3256  *	@kind: allocation type
3257  *
3258  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3259  *	rmem allocation. This function assumes that protocols which have
3260  *	memory_pressure use sk_wmem_queued as write buffer accounting.
3261  */
3262 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3263 {
3264 	int ret, amt = sk_mem_pages(size);
3265 
3266 	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3267 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3268 	if (!ret)
3269 		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3270 	return ret;
3271 }
3272 EXPORT_SYMBOL(__sk_mem_schedule);
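
/* Illustrative sketch (assumed receive path): protocols normally go through
 * the sk_rmem_schedule()/sk_wmem_schedule() wrappers, which only fall back
 * to __sk_mem_schedule() when sk_forward_alloc is too small:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 *		atomic_inc(&sk->sk_drops);
 *		kfree_skb(skb);
 *		return -ENOBUFS;
 *	}
 */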
3273 
3274 /**
3275  *	__sk_mem_reduce_allocated - reclaim memory_allocated
3276  *	@sk: socket
3277  *	@amount: number of quanta
3278  *
3279  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3280  */
3281 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3282 {
3283 	sk_memory_allocated_sub(sk, amount);
3284 
3285 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3286 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3287 
3288 	if (sk_under_global_memory_pressure(sk) &&
3289 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3290 		sk_leave_memory_pressure(sk);
3291 }
3292 
3293 /**
3294  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3295  *	@sk: socket
3296  *	@amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3297  */
3298 void __sk_mem_reclaim(struct sock *sk, int amount)
3299 {
3300 	amount >>= PAGE_SHIFT;
3301 	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3302 	__sk_mem_reduce_allocated(sk, amount);
3303 }
3304 EXPORT_SYMBOL(__sk_mem_reclaim);
3305 
3306 int sk_set_peek_off(struct sock *sk, int val)
3307 {
3308 	WRITE_ONCE(sk->sk_peek_off, val);
3309 	return 0;
3310 }
3311 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3312 
3313 /*
3314  * Set of default routines for initialising struct proto_ops when
3315  * the protocol does not support a particular function. In certain
3316  * cases where it makes no sense for a protocol to have a "do nothing"
3317  * function, some default processing is provided.
3318  */
3319 
3320 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3321 {
3322 	return -EOPNOTSUPP;
3323 }
3324 EXPORT_SYMBOL(sock_no_bind);
3325 
3326 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3327 		    int len, int flags)
3328 {
3329 	return -EOPNOTSUPP;
3330 }
3331 EXPORT_SYMBOL(sock_no_connect);
3332 
3333 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3334 {
3335 	return -EOPNOTSUPP;
3336 }
3337 EXPORT_SYMBOL(sock_no_socketpair);
3338 
3339 int sock_no_accept(struct socket *sock, struct socket *newsock,
3340 		   struct proto_accept_arg *arg)
3341 {
3342 	return -EOPNOTSUPP;
3343 }
3344 EXPORT_SYMBOL(sock_no_accept);
3345 
3346 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3347 		    int peer)
3348 {
3349 	return -EOPNOTSUPP;
3350 }
3351 EXPORT_SYMBOL(sock_no_getname);
3352 
3353 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3354 {
3355 	return -EOPNOTSUPP;
3356 }
3357 EXPORT_SYMBOL(sock_no_ioctl);
3358 
3359 int sock_no_listen(struct socket *sock, int backlog)
3360 {
3361 	return -EOPNOTSUPP;
3362 }
3363 EXPORT_SYMBOL(sock_no_listen);
3364 
3365 int sock_no_shutdown(struct socket *sock, int how)
3366 {
3367 	return -EOPNOTSUPP;
3368 }
3369 EXPORT_SYMBOL(sock_no_shutdown);
3370 
3371 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3372 {
3373 	return -EOPNOTSUPP;
3374 }
3375 EXPORT_SYMBOL(sock_no_sendmsg);
3376 
3377 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3378 {
3379 	return -EOPNOTSUPP;
3380 }
3381 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3382 
3383 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3384 		    int flags)
3385 {
3386 	return -EOPNOTSUPP;
3387 }
3388 EXPORT_SYMBOL(sock_no_recvmsg);
3389 
3390 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3391 {
3392 	/* Mirror missing mmap method error code */
3393 	return -ENODEV;
3394 }
3395 EXPORT_SYMBOL(sock_no_mmap);
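
/* Illustrative sketch (hypothetical protocol, not from this file): a
 * proto_ops table can wire the operations it does not support to the
 * stubs above; PF_MYPROTO and my_proto_ops are made-up names:
 *
 *	static const struct proto_ops my_proto_ops = {
 *		.family		= PF_MYPROTO,
 *		.owner		= THIS_MODULE,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */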
3396 
3397 /*
3398  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3399  * various sock-based usage counts.
3400  */
3401 void __receive_sock(struct file *file)
3402 {
3403 	struct socket *sock;
3404 
3405 	sock = sock_from_file(file);
3406 	if (sock) {
3407 		sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3408 		sock_update_classid(&sock->sk->sk_cgrp_data);
3409 	}
3410 }
3411 
3412 /*
3413  *	Default Socket Callbacks
3414  */
3415 
3416 static void sock_def_wakeup(struct sock *sk)
3417 {
3418 	struct socket_wq *wq;
3419 
3420 	rcu_read_lock();
3421 	wq = rcu_dereference(sk->sk_wq);
3422 	if (skwq_has_sleeper(wq))
3423 		wake_up_interruptible_all(&wq->wait);
3424 	rcu_read_unlock();
3425 }
3426 
3427 static void sock_def_error_report(struct sock *sk)
3428 {
3429 	struct socket_wq *wq;
3430 
3431 	rcu_read_lock();
3432 	wq = rcu_dereference(sk->sk_wq);
3433 	if (skwq_has_sleeper(wq))
3434 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3435 	sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3436 	rcu_read_unlock();
3437 }
3438 
3439 void sock_def_readable(struct sock *sk)
3440 {
3441 	struct socket_wq *wq;
3442 
3443 	trace_sk_data_ready(sk);
3444 
3445 	rcu_read_lock();
3446 	wq = rcu_dereference(sk->sk_wq);
3447 	if (skwq_has_sleeper(wq))
3448 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3449 						EPOLLRDNORM | EPOLLRDBAND);
3450 	sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3451 	rcu_read_unlock();
3452 }
3453 
3454 static void sock_def_write_space(struct sock *sk)
3455 {
3456 	struct socket_wq *wq;
3457 
3458 	rcu_read_lock();
3459 
3460 	/* Do not wake up a writer until he can make "significant"
3461 	 * progress.  --DaveM
3462 	 */
3463 	if (sock_writeable(sk)) {
3464 		wq = rcu_dereference(sk->sk_wq);
3465 		if (skwq_has_sleeper(wq))
3466 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3467 						EPOLLWRNORM | EPOLLWRBAND);
3468 
3469 		/* Should agree with poll, otherwise some programs break */
3470 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3471 	}
3472 
3473 	rcu_read_unlock();
3474 }
3475 
3476 /* An optimised version of sock_def_write_space(), should only be called
3477  * for SOCK_RCU_FREE sockets under RCU read section and after putting
3478  * ->sk_wmem_alloc.
3479  */
3480 static void sock_def_write_space_wfree(struct sock *sk)
3481 {
3482 	/* Do not wake up a writer until he can make "significant"
3483 	 * progress.  --DaveM
3484 	 */
3485 	if (sock_writeable(sk)) {
3486 		struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3487 
3488 		/* rely on refcount_sub from sock_wfree() */
3489 		smp_mb__after_atomic();
3490 		if (wq && waitqueue_active(&wq->wait))
3491 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3492 						EPOLLWRNORM | EPOLLWRBAND);
3493 
3494 		/* Should agree with poll, otherwise some programs break */
3495 		sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3496 	}
3497 }
3498 
3499 static void sock_def_destruct(struct sock *sk)
3500 {
3501 }
3502 
3503 void sk_send_sigurg(struct sock *sk)
3504 {
3505 	if (sk->sk_socket && sk->sk_socket->file)
3506 		if (send_sigurg(sk->sk_socket->file))
3507 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3508 }
3509 EXPORT_SYMBOL(sk_send_sigurg);
3510 
3511 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3512 		    unsigned long expires)
3513 {
3514 	if (!mod_timer(timer, expires))
3515 		sock_hold(sk);
3516 }
3517 EXPORT_SYMBOL(sk_reset_timer);
3518 
3519 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3520 {
3521 	if (del_timer(timer))
3522 		__sock_put(sk);
3523 }
3524 EXPORT_SYMBOL(sk_stop_timer);
3525 
3526 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3527 {
3528 	if (del_timer_sync(timer))
3529 		__sock_put(sk);
3530 }
3531 EXPORT_SYMBOL(sk_stop_timer_sync);
3532 
3533 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3534 {
3535 	sk_init_common(sk);
3536 	sk->sk_send_head	=	NULL;
3537 
3538 	timer_setup(&sk->sk_timer, NULL, 0);
3539 
3540 	sk->sk_allocation	=	GFP_KERNEL;
3541 	sk->sk_rcvbuf		=	READ_ONCE(sysctl_rmem_default);
3542 	sk->sk_sndbuf		=	READ_ONCE(sysctl_wmem_default);
3543 	sk->sk_state		=	TCP_CLOSE;
3544 	sk->sk_use_task_frag	=	true;
3545 	sk_set_socket(sk, sock);
3546 
3547 	sock_set_flag(sk, SOCK_ZAPPED);
3548 
3549 	if (sock) {
3550 		sk->sk_type	=	sock->type;
3551 		RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3552 		sock->sk	=	sk;
3553 	} else {
3554 		RCU_INIT_POINTER(sk->sk_wq, NULL);
3555 	}
3556 	sk->sk_uid	=	uid;
3557 
3558 	sk->sk_state_change	=	sock_def_wakeup;
3559 	sk->sk_data_ready	=	sock_def_readable;
3560 	sk->sk_write_space	=	sock_def_write_space;
3561 	sk->sk_error_report	=	sock_def_error_report;
3562 	sk->sk_destruct		=	sock_def_destruct;
3563 
3564 	sk->sk_frag.page	=	NULL;
3565 	sk->sk_frag.offset	=	0;
3566 	sk->sk_peek_off		=	-1;
3567 
3568 	sk->sk_peer_pid 	=	NULL;
3569 	sk->sk_peer_cred	=	NULL;
3570 	spin_lock_init(&sk->sk_peer_lock);
3571 
3572 	sk->sk_write_pending	=	0;
3573 	sk->sk_rcvlowat		=	1;
3574 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
3575 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
3576 
3577 	sk->sk_stamp = SK_DEFAULT_STAMP;
3578 #if BITS_PER_LONG==32
3579 	seqlock_init(&sk->sk_stamp_seq);
3580 #endif
3581 	atomic_set(&sk->sk_zckey, 0);
3582 
3583 #ifdef CONFIG_NET_RX_BUSY_POLL
3584 	sk->sk_napi_id		=	0;
3585 	sk->sk_ll_usec		=	READ_ONCE(sysctl_net_busy_read);
3586 #endif
3587 
3588 	sk->sk_max_pacing_rate = ~0UL;
3589 	sk->sk_pacing_rate = ~0UL;
3590 	WRITE_ONCE(sk->sk_pacing_shift, 10);
3591 	sk->sk_incoming_cpu = -1;
3592 
3593 	sk_rx_queue_clear(sk);
3594 	/*
3595 	 * Before updating sk_refcnt, we must commit prior changes to memory
3596 	 * (Documentation/RCU/rculist_nulls.rst for details)
3597 	 */
3598 	smp_wmb();
3599 	refcount_set(&sk->sk_refcnt, 1);
3600 	atomic_set(&sk->sk_drops, 0);
3601 }
3602 EXPORT_SYMBOL(sock_init_data_uid);
3603 
3604 void sock_init_data(struct socket *sock, struct sock *sk)
3605 {
3606 	kuid_t uid = sock ?
3607 		SOCK_INODE(sock)->i_uid :
3608 		make_kuid(sock_net(sk)->user_ns, 0);
3609 
3610 	sock_init_data_uid(sock, sk, uid);
3611 }
3612 EXPORT_SYMBOL(sock_init_data);
3613 
3614 void lock_sock_nested(struct sock *sk, int subclass)
3615 {
3616 	/* The sk_lock has mutex_lock() semantics here. */
3617 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3618 
3619 	might_sleep();
3620 	spin_lock_bh(&sk->sk_lock.slock);
3621 	if (sock_owned_by_user_nocheck(sk))
3622 		__lock_sock(sk);
3623 	sk->sk_lock.owned = 1;
3624 	spin_unlock_bh(&sk->sk_lock.slock);
3625 }
3626 EXPORT_SYMBOL(lock_sock_nested);
3627 
3628 void release_sock(struct sock *sk)
3629 {
3630 	spin_lock_bh(&sk->sk_lock.slock);
3631 	if (sk->sk_backlog.tail)
3632 		__release_sock(sk);
3633 
3634 	if (sk->sk_prot->release_cb)
3635 		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3636 				     tcp_release_cb, sk);
3637 
3638 	sock_release_ownership(sk);
3639 	if (waitqueue_active(&sk->sk_lock.wq))
3640 		wake_up(&sk->sk_lock.wq);
3641 	spin_unlock_bh(&sk->sk_lock.slock);
3642 }
3643 EXPORT_SYMBOL(release_sock);
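
/* Illustrative sketch (assumed process-context caller): lock_sock() and
 * release_sock() bracket any section that modifies socket state; on top of
 * dropping ownership, release_sock() drains the backlog queued meanwhile:
 *
 *	lock_sock(sk);
 *	... update socket state, possibly sleeping ...
 *	release_sock(sk);
 */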
3644 
3645 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3646 {
3647 	might_sleep();
3648 	spin_lock_bh(&sk->sk_lock.slock);
3649 
3650 	if (!sock_owned_by_user_nocheck(sk)) {
3651 		/*
3652 		 * Fast path return with bottom halves disabled and
3653 		 * sock::sk_lock.slock held.
3654 		 *
3655 		 * The 'mutex' is not contended and holding
3656 		 * sock::sk_lock.slock prevents all other lockers from
3657 		 * proceeding so the corresponding unlock_sock_fast() can
3658 		 * avoid the slow path of release_sock() completely and
3659 		 * just release slock.
3660 		 *
3661 		 * From a semantic point of view this is equivalent to 'acquiring'
3662 		 * the 'mutex', hence the corresponding lockdep
3663 		 * mutex_release() has to happen in the fast path of
3664 		 * unlock_sock_fast().
3665 		 */
3666 		return false;
3667 	}
3668 
3669 	__lock_sock(sk);
3670 	sk->sk_lock.owned = 1;
3671 	__acquire(&sk->sk_lock.slock);
3672 	spin_unlock_bh(&sk->sk_lock.slock);
3673 	return true;
3674 }
3675 EXPORT_SYMBOL(__lock_sock_fast);
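
/* Illustrative sketch, not part of this file: callers normally go through the
 * lock_sock_fast()/unlock_sock_fast() wrappers in include/net/sock.h and feed
 * the returned "slow" flag back so the slow path can fall back to
 * release_sock() when the lock was actually owned:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	... briefly touch socket state ...
 *	unlock_sock_fast(sk, slow);
 */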
3676 
3677 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3678 		   bool timeval, bool time32)
3679 {
3680 	struct sock *sk = sock->sk;
3681 	struct timespec64 ts;
3682 
3683 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3684 	ts = ktime_to_timespec64(sock_read_timestamp(sk));
3685 	if (ts.tv_sec == -1)
3686 		return -ENOENT;
3687 	if (ts.tv_sec == 0) {
3688 		ktime_t kt = ktime_get_real();
3689 		sock_write_timestamp(sk, kt);
3690 		ts = ktime_to_timespec64(kt);
3691 	}
3692 
3693 	if (timeval)
3694 		ts.tv_nsec /= 1000;
3695 
3696 #ifdef CONFIG_COMPAT_32BIT_TIME
3697 	if (time32)
3698 		return put_old_timespec32(&ts, userstamp);
3699 #endif
3700 #ifdef CONFIG_SPARC64
3701 	/* beware of padding in sparc64 timeval */
3702 	if (timeval && !in_compat_syscall()) {
3703 		struct __kernel_old_timeval __user tv = {
3704 			.tv_sec = ts.tv_sec,
3705 			.tv_usec = ts.tv_nsec,
3706 		};
3707 		if (copy_to_user(userstamp, &tv, sizeof(tv)))
3708 			return -EFAULT;
3709 		return 0;
3710 	}
3711 #endif
3712 	return put_timespec64(&ts, userstamp);
3713 }
3714 EXPORT_SYMBOL(sock_gettstamp);
3715 
3716 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3717 {
3718 	if (!sock_flag(sk, flag)) {
3719 		unsigned long previous_flags = sk->sk_flags;
3720 
3721 		sock_set_flag(sk, flag);
3722 		/*
3723 		 * We just set one of the two flags that require net
3724 		 * time stamping, but time stamping might already have
3725 		 * been enabled because of the other one.
3726 		 */
3727 		if (sock_needs_netstamp(sk) &&
3728 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
3729 			net_enable_timestamp();
3730 	}
3731 }
3732 
3733 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3734 		       int level, int type)
3735 {
3736 	struct sock_exterr_skb *serr;
3737 	struct sk_buff *skb;
3738 	int copied, err;
3739 
3740 	err = -EAGAIN;
3741 	skb = sock_dequeue_err_skb(sk);
3742 	if (skb == NULL)
3743 		goto out;
3744 
3745 	copied = skb->len;
3746 	if (copied > len) {
3747 		msg->msg_flags |= MSG_TRUNC;
3748 		copied = len;
3749 	}
3750 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
3751 	if (err)
3752 		goto out_free_skb;
3753 
3754 	sock_recv_timestamp(msg, sk, skb);
3755 
3756 	serr = SKB_EXT_ERR(skb);
3757 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3758 
3759 	msg->msg_flags |= MSG_ERRQUEUE;
3760 	err = copied;
3761 
3762 out_free_skb:
3763 	kfree_skb(skb);
3764 out:
3765 	return err;
3766 }
3767 EXPORT_SYMBOL(sock_recv_errqueue);
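
/* Illustrative sketch, not part of this file: a protocol recvmsg handler
 * typically dispatches MSG_ERRQUEUE requests to the helper above before
 * touching its own receive queue. SOL_FOO and FOO_RECVERR are hypothetical
 * cmsg level/type values.
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len, SOL_FOO, FOO_RECVERR);
 */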
3768 
3769 /*
3770  *	Get a socket option on a socket.
3771  *
3772  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
3773  *	asynchronous errors should be reported by getsockopt. We assume
3774  *	this means only when SO_ERROR is specified (otherwise what is the point of it).
3775  */
3776 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3777 			   char __user *optval, int __user *optlen)
3778 {
3779 	struct sock *sk = sock->sk;
3780 
3781 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3782 	return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3783 }
3784 EXPORT_SYMBOL(sock_common_getsockopt);
3785 
3786 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3787 			int flags)
3788 {
3789 	struct sock *sk = sock->sk;
3790 	int addr_len = 0;
3791 	int err;
3792 
3793 	err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3794 	if (err >= 0)
3795 		msg->msg_namelen = addr_len;
3796 	return err;
3797 }
3798 EXPORT_SYMBOL(sock_common_recvmsg);
3799 
3800 /*
3801  *	Set socket options on an inet socket.
3802  */
3803 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3804 			   sockptr_t optval, unsigned int optlen)
3805 {
3806 	struct sock *sk = sock->sk;
3807 
3808 	/* IPV6_ADDRFORM can change sk->sk_prot under us. */
3809 	return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3810 }
3811 EXPORT_SYMBOL(sock_common_setsockopt);
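
/* Illustrative sketch, not part of this file: address families commonly wire
 * the sock_common_* helpers straight into their struct proto_ops so that the
 * call is simply forwarded to sk->sk_prot. "foo_stream_ops" is a hypothetical
 * name:
 *
 *	static const struct proto_ops foo_stream_ops = {
 *		...
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 */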
3812 
3813 void sk_common_release(struct sock *sk)
3814 {
3815 	if (sk->sk_prot->destroy)
3816 		sk->sk_prot->destroy(sk);
3817 
3818 	/*
3819 	 * Observation: when sk_common_release() is called, user processes
3820 	 * no longer have access to the socket, but the network stack still does.
3821 	 * Step one, detach it from networking:
3822 	 *
3823 	 * A. Remove it from the hash tables.
3824 	 */
3825 
3826 	sk->sk_prot->unhash(sk);
3827 
3828 	if (sk->sk_socket)
3829 		sk->sk_socket->sk = NULL;
3830 
3831 	/*
3832 	 * At this point the socket cannot receive new packets, but some may
3833 	 * still be in flight because another CPU running the receive path did
3834 	 * its hash table lookup before we unhashed the socket. Those packets
3835 	 * will reach the receive queue and be purged by the socket destructor.
3836 	 *
3837 	 * We may also still have packets pending on the receive queue and,
3838 	 * probably, our own packets waiting in device queues. sock_destroy
3839 	 * will drain the receive queue, but transmitted packets delay socket
3840 	 * destruction until the last reference is released.
3841 	 */
3842 
3843 	sock_orphan(sk);
3844 
3845 	xfrm_sk_free_policy(sk);
3846 
3847 	sock_put(sk);
3848 }
3849 EXPORT_SYMBOL(sk_common_release);
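
/* Illustrative sketch, not part of this file: protocols without extra
 * teardown work typically end their ->close() path here; foo_close() is a
 * hypothetical example.
 *
 *	static void foo_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */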
3850 
3851 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3852 {
3853 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3854 
3855 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3856 	mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3857 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3858 	mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3859 	mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
3860 	mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3861 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3862 	mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3863 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3864 }
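
/* Illustrative sketch, not part of this file: this is roughly how a sock_diag
 * style consumer exports the per-socket memory counters as one netlink
 * attribute; foo_put_meminfo() and the attribute type are placeholders.
 *
 *	static int foo_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
 *	{
 *		u32 mem[SK_MEMINFO_VARS];
 *
 *		sk_get_meminfo(sk, mem);
 *		return nla_put(skb, attrtype, sizeof(mem), mem);
 *	}
 */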
3865 
3866 #ifdef CONFIG_PROC_FS
3867 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3868 
3869 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3870 {
3871 	int cpu, idx = prot->inuse_idx;
3872 	int res = 0;
3873 
3874 	for_each_possible_cpu(cpu)
3875 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3876 
3877 	return res >= 0 ? res : 0;
3878 }
3879 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
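
/* Illustrative sketch, not part of this file: the per-cpu counter read above
 * only makes sense if the protocol's hash/unhash callbacks account sockets
 * with sock_prot_inuse_add(), e.g.:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// on unhash
 */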
3880 
3881 int sock_inuse_get(struct net *net)
3882 {
3883 	int cpu, res = 0;
3884 
3885 	for_each_possible_cpu(cpu)
3886 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3887 
3888 	return res;
3889 }
3890 
3891 EXPORT_SYMBOL_GPL(sock_inuse_get);
3892 
3893 static int __net_init sock_inuse_init_net(struct net *net)
3894 {
3895 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3896 	if (net->core.prot_inuse == NULL)
3897 		return -ENOMEM;
3898 	return 0;
3899 }
3900 
3901 static void __net_exit sock_inuse_exit_net(struct net *net)
3902 {
3903 	free_percpu(net->core.prot_inuse);
3904 }
3905 
3906 static struct pernet_operations net_inuse_ops = {
3907 	.init = sock_inuse_init_net,
3908 	.exit = sock_inuse_exit_net,
3909 };
3910 
3911 static __init int net_inuse_init(void)
3912 {
3913 	if (register_pernet_subsys(&net_inuse_ops))
3914 		panic("Cannot initialize net inuse counters");
3915 
3916 	return 0;
3917 }
3918 
3919 core_initcall(net_inuse_init);
3920 
3921 static int assign_proto_idx(struct proto *prot)
3922 {
3923 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3924 
3925 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3926 		pr_err("PROTO_INUSE_NR exhausted\n");
3927 		return -ENOSPC;
3928 	}
3929 
3930 	set_bit(prot->inuse_idx, proto_inuse_idx);
3931 	return 0;
3932 }
3933 
3934 static void release_proto_idx(struct proto *prot)
3935 {
3936 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3937 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3938 }
3939 #else
3940 static inline int assign_proto_idx(struct proto *prot)
3941 {
3942 	return 0;
3943 }
3944 
3945 static inline void release_proto_idx(struct proto *prot)
3946 {
3947 }
3948 
3949 #endif
3950 
3951 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3952 {
3953 	if (!twsk_prot)
3954 		return;
3955 	kfree(twsk_prot->twsk_slab_name);
3956 	twsk_prot->twsk_slab_name = NULL;
3957 	kmem_cache_destroy(twsk_prot->twsk_slab);
3958 	twsk_prot->twsk_slab = NULL;
3959 }
3960 
3961 static int tw_prot_init(const struct proto *prot)
3962 {
3963 	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3964 
3965 	if (!twsk_prot)
3966 		return 0;
3967 
3968 	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3969 					      prot->name);
3970 	if (!twsk_prot->twsk_slab_name)
3971 		return -ENOMEM;
3972 
3973 	twsk_prot->twsk_slab =
3974 		kmem_cache_create(twsk_prot->twsk_slab_name,
3975 				  twsk_prot->twsk_obj_size, 0,
3976 				  SLAB_ACCOUNT | prot->slab_flags,
3977 				  NULL);
3978 	if (!twsk_prot->twsk_slab) {
3979 		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3980 			prot->name);
3981 		return -ENOMEM;
3982 	}
3983 
3984 	return 0;
3985 }
3986 
3987 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3988 {
3989 	if (!rsk_prot)
3990 		return;
3991 	kfree(rsk_prot->slab_name);
3992 	rsk_prot->slab_name = NULL;
3993 	kmem_cache_destroy(rsk_prot->slab);
3994 	rsk_prot->slab = NULL;
3995 }
3996 
3997 static int req_prot_init(const struct proto *prot)
3998 {
3999 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
4000 
4001 	if (!rsk_prot)
4002 		return 0;
4003 
4004 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4005 					prot->name);
4006 	if (!rsk_prot->slab_name)
4007 		return -ENOMEM;
4008 
4009 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4010 					   rsk_prot->obj_size, 0,
4011 					   SLAB_ACCOUNT | prot->slab_flags,
4012 					   NULL);
4013 
4014 	if (!rsk_prot->slab) {
4015 		pr_crit("%s: Can't create request sock SLAB cache!\n",
4016 			prot->name);
4017 		return -ENOMEM;
4018 	}
4019 	return 0;
4020 }
4021 
4022 int proto_register(struct proto *prot, int alloc_slab)
4023 {
4024 	int ret = -ENOBUFS;
4025 
4026 	if (prot->memory_allocated && !prot->sysctl_mem) {
4027 		pr_err("%s: missing sysctl_mem\n", prot->name);
4028 		return -EINVAL;
4029 	}
4030 	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4031 		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4032 		return -EINVAL;
4033 	}
4034 	if (alloc_slab) {
4035 		prot->slab = kmem_cache_create_usercopy(prot->name,
4036 					prot->obj_size, 0,
4037 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4038 					prot->slab_flags,
4039 					prot->useroffset, prot->usersize,
4040 					NULL);
4041 
4042 		if (prot->slab == NULL) {
4043 			pr_crit("%s: Can't create sock SLAB cache!\n",
4044 				prot->name);
4045 			goto out;
4046 		}
4047 
4048 		if (req_prot_init(prot))
4049 			goto out_free_request_sock_slab;
4050 
4051 		if (tw_prot_init(prot))
4052 			goto out_free_timewait_sock_slab;
4053 	}
4054 
4055 	mutex_lock(&proto_list_mutex);
4056 	ret = assign_proto_idx(prot);
4057 	if (ret) {
4058 		mutex_unlock(&proto_list_mutex);
4059 		goto out_free_timewait_sock_slab;
4060 	}
4061 	list_add(&prot->node, &proto_list);
4062 	mutex_unlock(&proto_list_mutex);
4063 	return ret;
4064 
4065 out_free_timewait_sock_slab:
4066 	if (alloc_slab)
4067 		tw_prot_cleanup(prot->twsk_prot);
4068 out_free_request_sock_slab:
4069 	if (alloc_slab) {
4070 		req_prot_cleanup(prot->rsk_prot);
4071 
4072 		kmem_cache_destroy(prot->slab);
4073 		prot->slab = NULL;
4074 	}
4075 out:
4076 	return ret;
4077 }
4078 EXPORT_SYMBOL(proto_register);
4079 
4080 void proto_unregister(struct proto *prot)
4081 {
4082 	mutex_lock(&proto_list_mutex);
4083 	release_proto_idx(prot);
4084 	list_del(&prot->node);
4085 	mutex_unlock(&proto_list_mutex);
4086 
4087 	kmem_cache_destroy(prot->slab);
4088 	prot->slab = NULL;
4089 
4090 	req_prot_cleanup(prot->rsk_prot);
4091 	tw_prot_cleanup(prot->twsk_prot);
4092 }
4093 EXPORT_SYMBOL(proto_unregister);
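
/* Illustrative sketch, not part of this file: a protocol module pairs
 * proto_register() in its init path with proto_unregister() on exit or
 * error unwind. "foo_prot" and "struct foo_sock" are hypothetical names.
 *
 *	static struct proto foo_prot = {
 *		.name	  = "FOO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		return proto_register(&foo_prot, 1);	// 1 => allocate slab
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */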
4094 
4095 int sock_load_diag_module(int family, int protocol)
4096 {
4097 	if (!protocol) {
4098 		if (!sock_is_registered(family))
4099 			return -ENOENT;
4100 
4101 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4102 				      NETLINK_SOCK_DIAG, family);
4103 	}
4104 
4105 #ifdef CONFIG_INET
4106 	if (family == AF_INET &&
4107 	    protocol != IPPROTO_RAW &&
4108 	    protocol < MAX_INET_PROTOS &&
4109 	    !rcu_access_pointer(inet_protos[protocol]))
4110 		return -ENOENT;
4111 #endif
4112 
4113 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4114 			      NETLINK_SOCK_DIAG, family, protocol);
4115 }
4116 EXPORT_SYMBOL(sock_load_diag_module);
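
/* Illustrative worked example, not part of this file: with PF_NETLINK == 16
 * and NETLINK_SOCK_DIAG == 4, a request for family AF_INET (2) and protocol
 * IPPROTO_TCP (6) expands to the module alias string
 *
 *	"net-pf-16-proto-4-type-2-6"
 *
 * which the matching diag module is expected to advertise via MODULE_ALIAS().
 */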
4117 
4118 #ifdef CONFIG_PROC_FS
4119 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4120 	__acquires(proto_list_mutex)
4121 {
4122 	mutex_lock(&proto_list_mutex);
4123 	return seq_list_start_head(&proto_list, *pos);
4124 }
4125 
4126 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4127 {
4128 	return seq_list_next(v, &proto_list, pos);
4129 }
4130 
4131 static void proto_seq_stop(struct seq_file *seq, void *v)
4132 	__releases(proto_list_mutex)
4133 {
4134 	mutex_unlock(&proto_list_mutex);
4135 }
4136 
4137 static char proto_method_implemented(const void *method)
4138 {
4139 	return method == NULL ? 'n' : 'y';
4140 }
4141 static long sock_prot_memory_allocated(struct proto *proto)
4142 {
4143 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4144 }
4145 
4146 static const char *sock_prot_memory_pressure(struct proto *proto)
4147 {
4148 	return proto->memory_pressure != NULL ?
4149 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4150 }
4151 
4152 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4153 {
4154 
4155 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
4156 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4157 		   proto->name,
4158 		   proto->obj_size,
4159 		   sock_prot_inuse_get(seq_file_net(seq), proto),
4160 		   sock_prot_memory_allocated(proto),
4161 		   sock_prot_memory_pressure(proto),
4162 		   proto->max_header,
4163 		   proto->slab == NULL ? "no" : "yes",
4164 		   module_name(proto->owner),
4165 		   proto_method_implemented(proto->close),
4166 		   proto_method_implemented(proto->connect),
4167 		   proto_method_implemented(proto->disconnect),
4168 		   proto_method_implemented(proto->accept),
4169 		   proto_method_implemented(proto->ioctl),
4170 		   proto_method_implemented(proto->init),
4171 		   proto_method_implemented(proto->destroy),
4172 		   proto_method_implemented(proto->shutdown),
4173 		   proto_method_implemented(proto->setsockopt),
4174 		   proto_method_implemented(proto->getsockopt),
4175 		   proto_method_implemented(proto->sendmsg),
4176 		   proto_method_implemented(proto->recvmsg),
4177 		   proto_method_implemented(proto->bind),
4178 		   proto_method_implemented(proto->backlog_rcv),
4179 		   proto_method_implemented(proto->hash),
4180 		   proto_method_implemented(proto->unhash),
4181 		   proto_method_implemented(proto->get_port),
4182 		   proto_method_implemented(proto->enter_memory_pressure));
4183 }
4184 
4185 static int proto_seq_show(struct seq_file *seq, void *v)
4186 {
4187 	if (v == &proto_list)
4188 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4189 			   "protocol",
4190 			   "size",
4191 			   "sockets",
4192 			   "memory",
4193 			   "press",
4194 			   "maxhdr",
4195 			   "slab",
4196 			   "module",
4197 			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4198 	else
4199 		proto_seq_printf(seq, list_entry(v, struct proto, node));
4200 	return 0;
4201 }
4202 
4203 static const struct seq_operations proto_seq_ops = {
4204 	.start  = proto_seq_start,
4205 	.next   = proto_seq_next,
4206 	.stop   = proto_seq_stop,
4207 	.show   = proto_seq_show,
4208 };
4209 
4210 static __net_init int proto_init_net(struct net *net)
4211 {
4212 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4213 			sizeof(struct seq_net_private)))
4214 		return -ENOMEM;
4215 
4216 	return 0;
4217 }
4218 
4219 static __net_exit void proto_exit_net(struct net *net)
4220 {
4221 	remove_proc_entry("protocols", net->proc_net);
4222 }
4223 
4224 
4225 static __net_initdata struct pernet_operations proto_net_ops = {
4226 	.init = proto_init_net,
4227 	.exit = proto_exit_net,
4228 };
4229 
4230 static int __init proto_init(void)
4231 {
4232 	return register_pernet_subsys(&proto_net_ops);
4233 }
4234 
4235 subsys_initcall(proto_init);
4236 
4237 #endif /* CONFIG_PROC_FS */
4238 
4239 #ifdef CONFIG_NET_RX_BUSY_POLL
4240 bool sk_busy_loop_end(void *p, unsigned long start_time)
4241 {
4242 	struct sock *sk = p;
4243 
4244 	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4245 		return true;
4246 
4247 	if (sk_is_udp(sk) &&
4248 	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4249 		return true;
4250 
4251 	return sk_busy_loop_timeout(sk, start_time);
4252 }
4253 EXPORT_SYMBOL(sk_busy_loop_end);
4254 #endif /* CONFIG_NET_RX_BUSY_POLL */
4255 
4256 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4257 {
4258 	if (!sk->sk_prot->bind_add)
4259 		return -EOPNOTSUPP;
4260 	return sk->sk_prot->bind_add(sk, addr, addr_len);
4261 }
4262 EXPORT_SYMBOL(sock_bind_add);
4263 
4264 /* Copy 'size' bytes from userspace into karg, and copy 'size' bytes of the result back to userspace */
4265 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4266 		     void __user *arg, void *karg, size_t size)
4267 {
4268 	int ret;
4269 
4270 	if (copy_from_user(karg, arg, size))
4271 		return -EFAULT;
4272 
4273 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4274 	if (ret)
4275 		return ret;
4276 
4277 	if (copy_to_user(arg, karg, size))
4278 		return -EFAULT;
4279 
4280 	return 0;
4281 }
4282 EXPORT_SYMBOL(sock_ioctl_inout);
4283 
4284 /* This is the most common ioctl prep function, where the result (4 bytes) is
4285  * copied back to userspace if the ioctl() returns successfully. No input
4286  * argument is copied from userspace.
4287  */
4288 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4289 {
4290 	int ret, karg = 0;
4291 
4292 	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4293 	if (ret)
4294 		return ret;
4295 
4296 	return put_user(karg, (int __user *)arg);
4297 }
4298 
4299 /* A wrapper around sock ioctls, which copies the data from userspace
4300  * (depending on the protocol/ioctl), and copies back the result to userspace.
4301  * The main motivation for this function is to pass kernel memory to the
4302  * protocol ioctl callbacks, instead of userspace memory.
4303  */
4304 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4305 {
4306 	int rc = 1;
4307 
4308 	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4309 		rc = ipmr_sk_ioctl(sk, cmd, arg);
4310 	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4311 		rc = ip6mr_sk_ioctl(sk, cmd, arg);
4312 	else if (sk_is_phonet(sk))
4313 		rc = phonet_sk_ioctl(sk, cmd, arg);
4314 
4315 	/* If the ioctl was processed, return its value */
4316 	if (rc <= 0)
4317 		return rc;
4318 
4319 	/* Otherwise call the default handler */
4320 	return sock_ioctl_out(sk, cmd, arg);
4321 }
4322 EXPORT_SYMBOL(sk_ioctl);
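
/* Illustrative sketch, not part of this file: with the wrappers above a
 * protocol ->ioctl callback receives kernel memory (karg) instead of a user
 * pointer, so a hypothetical handler only has to fill in its result:
 *
 *	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		if (cmd != SIOCINQ)
 *			return -ENOIOCTLCMD;
 *		*karg = foo_readable_bytes(sk);	// hypothetical helper
 *		return 0;
 *	}
 */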
4323 
4324 static int __init sock_struct_check(void)
4325 {
4326 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4327 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4328 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4329 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4330 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4331 
4332 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4333 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4334 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4335 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4336 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4337 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4338 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4339 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4340 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4341 
4342 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4343 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4344 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4345 
4346 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4347 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4348 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4349 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4350 
4351 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4353 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4354 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4355 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4356 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4357 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4358 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4359 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4360 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4361 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4362 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4363 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4364 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4365 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4366 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4367 
4368 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4369 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4370 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4371 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4372 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4373 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4374 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4375 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4376 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4377 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4378 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4379 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4380 	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4381 	return 0;
4382 }
4383 
4384 core_initcall(sock_struct_check);
4385