xref: /linux/net/core/sock.c (revision 22fc4c4c9fd60427bcda00878cee94e7622cfa7a)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <asm/unaligned.h>
95 #include <linux/capability.h>
96 #include <linux/errno.h>
97 #include <linux/errqueue.h>
98 #include <linux/types.h>
99 #include <linux/socket.h>
100 #include <linux/in.h>
101 #include <linux/kernel.h>
102 #include <linux/module.h>
103 #include <linux/proc_fs.h>
104 #include <linux/seq_file.h>
105 #include <linux/sched.h>
106 #include <linux/sched/mm.h>
107 #include <linux/timer.h>
108 #include <linux/string.h>
109 #include <linux/sockios.h>
110 #include <linux/net.h>
111 #include <linux/mm.h>
112 #include <linux/slab.h>
113 #include <linux/interrupt.h>
114 #include <linux/poll.h>
115 #include <linux/tcp.h>
116 #include <linux/init.h>
117 #include <linux/highmem.h>
118 #include <linux/user_namespace.h>
119 #include <linux/static_key.h>
120 #include <linux/memcontrol.h>
121 #include <linux/prefetch.h>
122 
123 #include <linux/uaccess.h>
124 
125 #include <linux/netdevice.h>
126 #include <net/protocol.h>
127 #include <linux/skbuff.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <linux/net_tstamp.h>
132 #include <net/xfrm.h>
133 #include <linux/ipsec.h>
134 #include <net/cls_cgroup.h>
135 #include <net/netprio_cgroup.h>
136 #include <linux/sock_diag.h>
137 
138 #include <linux/filter.h>
139 #include <net/sock_reuseport.h>
140 
141 #include <trace/events/sock.h>
142 
143 #include <net/tcp.h>
144 #include <net/busy_poll.h>
145 
146 static DEFINE_MUTEX(proto_list_mutex);
147 static LIST_HEAD(proto_list);
148 
149 static void sock_inuse_add(struct net *net, int val);
150 
151 /**
152  * sk_ns_capable - General socket capability test
153  * @sk: Socket to use a capability on or through
154  * @user_ns: The user namespace of the capability to use
155  * @cap: The capability to use
156  *
157  * Test to see if the opener of the socket had the capability @cap when
158  * the socket was created and that the current process has it in the
159  * user namespace @user_ns.
160  */
161 bool sk_ns_capable(const struct sock *sk,
162 		   struct user_namespace *user_ns, int cap)
163 {
164 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
165 		ns_capable(user_ns, cap);
166 }
167 EXPORT_SYMBOL(sk_ns_capable);
168 
169 /**
170  * sk_capable - Socket global capability test
171  * @sk: Socket to use a capability on or through
172  * @cap: The global capability to use
173  *
174  * Test to see if the opener of the socket had the capability @cap when
175  * the socket was created and that the current process has it in all
176  * user namespaces.
177  */
178 bool sk_capable(const struct sock *sk, int cap)
179 {
180 	return sk_ns_capable(sk, &init_user_ns, cap);
181 }
182 EXPORT_SYMBOL(sk_capable);
183 
184 /**
185  * sk_net_capable - Network namespace socket capability test
186  * @sk: Socket to use a capability on or through
187  * @cap: The capability to use
188  *
189  * Test to see if the opener of the socket had the capability @cap when the
190  * socket was created and that the current process has it over the network
191  * namespace the socket is a member of.
192  */
193 bool sk_net_capable(const struct sock *sk, int cap)
194 {
195 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
196 }
197 EXPORT_SYMBOL(sk_net_capable);
198 
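/*
 * Usage sketch (the surrounding handler and the CAP_NET_ADMIN choice are
 * illustrative assumptions, not taken from this file): a protocol can gate
 * an administrative request on both the socket opener's privileges at
 * creation time and the current task's privileges in the socket's network
 * namespace.
 *
 *	static int example_admin_request(struct sock *sk)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		return 0;
 *	}
 */
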
199 /*
200  * Each address family might have different locking rules, so we have
201  * one slock key per address family and separate keys for internal and
202  * userspace sockets.
203  */
204 static struct lock_class_key af_family_keys[AF_MAX];
205 static struct lock_class_key af_family_kern_keys[AF_MAX];
206 static struct lock_class_key af_family_slock_keys[AF_MAX];
207 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
208 
209 /*
210  * Make lock validator output more readable. (we pre-construct these
211  * strings at build time, so that runtime initialization of socket
212  * locks is fast):
213  */
214 
215 #define _sock_locks(x)						  \
216   x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
217   x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
218   x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
219   x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
220   x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
221   x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
222   x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
223   x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
224   x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
225   x "27"       ,	x "28"          ,	x "AF_CAN"      , \
226   x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
227   x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
228   x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
229   x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
230   x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
231   x "AF_MAX"
232 
233 static const char *const af_family_key_strings[AF_MAX+1] = {
234 	_sock_locks("sk_lock-")
235 };
236 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
237 	_sock_locks("slock-")
238 };
239 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
240 	_sock_locks("clock-")
241 };
242 
243 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
244 	_sock_locks("k-sk_lock-")
245 };
246 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
247 	_sock_locks("k-slock-")
248 };
249 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
250 	_sock_locks("k-clock-")
251 };
252 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
253 	_sock_locks("rlock-")
254 };
255 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
256 	_sock_locks("wlock-")
257 };
258 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
259 	_sock_locks("elock-")
260 };
261 
262 /*
263  * sk_callback_lock and sk queues locking rules are per-address-family,
264  * so split the lock classes by using a per-AF key:
265  */
266 static struct lock_class_key af_callback_keys[AF_MAX];
267 static struct lock_class_key af_rlock_keys[AF_MAX];
268 static struct lock_class_key af_wlock_keys[AF_MAX];
269 static struct lock_class_key af_elock_keys[AF_MAX];
270 static struct lock_class_key af_kern_callback_keys[AF_MAX];
271 
272 /* Run time adjustable parameters. */
273 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
274 EXPORT_SYMBOL(sysctl_wmem_max);
275 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
276 EXPORT_SYMBOL(sysctl_rmem_max);
277 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
278 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
279 
280 /* Maximal space eaten by iovec or ancillary data plus some space */
281 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
282 EXPORT_SYMBOL(sysctl_optmem_max);
283 
284 int sysctl_tstamp_allow_data __read_mostly = 1;
285 
286 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
287 EXPORT_SYMBOL_GPL(memalloc_socks_key);
288 
289 /**
290  * sk_set_memalloc - sets %SOCK_MEMALLOC
291  * @sk: socket to set it on
292  *
293  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
294  * It's the responsibility of the admin to adjust min_free_kbytes
295  * to meet the requirements.
296  */
297 void sk_set_memalloc(struct sock *sk)
298 {
299 	sock_set_flag(sk, SOCK_MEMALLOC);
300 	sk->sk_allocation |= __GFP_MEMALLOC;
301 	static_branch_inc(&memalloc_socks_key);
302 }
303 EXPORT_SYMBOL_GPL(sk_set_memalloc);
304 
305 void sk_clear_memalloc(struct sock *sk)
306 {
307 	sock_reset_flag(sk, SOCK_MEMALLOC);
308 	sk->sk_allocation &= ~__GFP_MEMALLOC;
309 	static_branch_dec(&memalloc_socks_key);
310 
311 	/*
312 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
313 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
314 	 * it has rmem allocations due to the last swapfile being deactivated
315 	 * but there is a risk that the socket is unusable due to exceeding
316 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
317 	 */
318 	sk_mem_reclaim(sk);
319 }
320 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
321 
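/*
 * Usage sketch (the swap-over-network style caller is an illustrative
 * assumption, not something this file defines): a subsystem that must keep
 * making progress under memory pressure marks its transport socket so its
 * allocations may dip into the emergency reserves, then clears the flag when
 * that path is torn down so the usual rmem limits apply again.
 *
 *	sk_set_memalloc(sk);
 *	... transfer data while the system is under memory pressure ...
 *	sk_clear_memalloc(sk);
 */
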
322 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
323 {
324 	int ret;
325 	unsigned int noreclaim_flag;
326 
327 	/* these should have been dropped before queueing */
328 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
329 
330 	noreclaim_flag = memalloc_noreclaim_save();
331 	ret = sk->sk_backlog_rcv(sk, skb);
332 	memalloc_noreclaim_restore(noreclaim_flag);
333 
334 	return ret;
335 }
336 EXPORT_SYMBOL(__sk_backlog_rcv);
337 
338 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
339 {
340 	struct timeval tv;
341 
342 	if (optlen < sizeof(tv))
343 		return -EINVAL;
344 	if (copy_from_user(&tv, optval, sizeof(tv)))
345 		return -EFAULT;
346 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
347 		return -EDOM;
348 
349 	if (tv.tv_sec < 0) {
350 		static int warned __read_mostly;
351 
352 		*timeo_p = 0;
353 		if (warned < 10 && net_ratelimit()) {
354 			warned++;
355 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
356 				__func__, current->comm, task_pid_nr(current));
357 		}
358 		return 0;
359 	}
360 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
361 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
362 		return 0;
363 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
364 		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
365 	return 0;
366 }
367 
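/*
 * Userspace view of the conversion above (minimal sketch, error handling
 * omitted): the timeval is validated (tv_usec must stay below USEC_PER_SEC),
 * a zero timeout means the operation never times out, and a negative tv_sec
 * is warned about and treated as a zero wait.
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 */
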
368 static void sock_warn_obsolete_bsdism(const char *name)
369 {
370 	static int warned;
371 	static char warncomm[TASK_COMM_LEN];
372 	if (strcmp(warncomm, current->comm) && warned < 5) {
373 		strcpy(warncomm,  current->comm);
374 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
375 			warncomm, name);
376 		warned++;
377 	}
378 }
379 
380 static bool sock_needs_netstamp(const struct sock *sk)
381 {
382 	switch (sk->sk_family) {
383 	case AF_UNSPEC:
384 	case AF_UNIX:
385 		return false;
386 	default:
387 		return true;
388 	}
389 }
390 
391 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
392 {
393 	if (sk->sk_flags & flags) {
394 		sk->sk_flags &= ~flags;
395 		if (sock_needs_netstamp(sk) &&
396 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
397 			net_disable_timestamp();
398 	}
399 }
400 
401 
402 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
403 {
404 	unsigned long flags;
405 	struct sk_buff_head *list = &sk->sk_receive_queue;
406 
407 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
408 		atomic_inc(&sk->sk_drops);
409 		trace_sock_rcvqueue_full(sk, skb);
410 		return -ENOMEM;
411 	}
412 
413 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
414 		atomic_inc(&sk->sk_drops);
415 		return -ENOBUFS;
416 	}
417 
418 	skb->dev = NULL;
419 	skb_set_owner_r(skb, sk);
420 
421 	/* we escape from the rcu protected region, make sure we don't leak
422 	 * a non-refcounted dst
423 	 */
424 	skb_dst_force(skb);
425 
426 	spin_lock_irqsave(&list->lock, flags);
427 	sock_skb_set_dropcount(sk, skb);
428 	__skb_queue_tail(list, skb);
429 	spin_unlock_irqrestore(&list->lock, flags);
430 
431 	if (!sock_flag(sk, SOCK_DEAD))
432 		sk->sk_data_ready(sk);
433 	return 0;
434 }
435 EXPORT_SYMBOL(__sock_queue_rcv_skb);
436 
437 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
438 {
439 	int err;
440 
441 	err = sk_filter(sk, skb);
442 	if (err)
443 		return err;
444 
445 	return __sock_queue_rcv_skb(sk, skb);
446 }
447 EXPORT_SYMBOL(sock_queue_rcv_skb);
448 
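/*
 * Usage sketch (the protocol rcv handler shown is an illustrative
 * assumption): a delivery path hands the skb to the socket and frees it
 * itself when the socket filter rejects it or the receive queue is full,
 * since sock_queue_rcv_skb() does not free the skb on failure.
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */
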
449 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
450 		     const int nested, unsigned int trim_cap, bool refcounted)
451 {
452 	int rc = NET_RX_SUCCESS;
453 
454 	if (sk_filter_trim_cap(sk, skb, trim_cap))
455 		goto discard_and_relse;
456 
457 	skb->dev = NULL;
458 
459 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
460 		atomic_inc(&sk->sk_drops);
461 		goto discard_and_relse;
462 	}
463 	if (nested)
464 		bh_lock_sock_nested(sk);
465 	else
466 		bh_lock_sock(sk);
467 	if (!sock_owned_by_user(sk)) {
468 		/*
469 		 * trylock + unlock semantics:
470 		 */
471 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
472 
473 		rc = sk_backlog_rcv(sk, skb);
474 
475 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
476 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
477 		bh_unlock_sock(sk);
478 		atomic_inc(&sk->sk_drops);
479 		goto discard_and_relse;
480 	}
481 
482 	bh_unlock_sock(sk);
483 out:
484 	if (refcounted)
485 		sock_put(sk);
486 	return rc;
487 discard_and_relse:
488 	kfree_skb(skb);
489 	goto out;
490 }
491 EXPORT_SYMBOL(__sk_receive_skb);
492 
493 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
494 {
495 	struct dst_entry *dst = __sk_dst_get(sk);
496 
497 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
498 		sk_tx_queue_clear(sk);
499 		sk->sk_dst_pending_confirm = 0;
500 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
501 		dst_release(dst);
502 		return NULL;
503 	}
504 
505 	return dst;
506 }
507 EXPORT_SYMBOL(__sk_dst_check);
508 
509 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
510 {
511 	struct dst_entry *dst = sk_dst_get(sk);
512 
513 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
514 		sk_dst_reset(sk);
515 		dst_release(dst);
516 		return NULL;
517 	}
518 
519 	return dst;
520 }
521 EXPORT_SYMBOL(sk_dst_check);
522 
523 static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
524 {
525 	int ret = -ENOPROTOOPT;
526 #ifdef CONFIG_NETDEVICES
527 	struct net *net = sock_net(sk);
528 
529 	/* Sorry... */
530 	ret = -EPERM;
531 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
532 		goto out;
533 
534 	ret = -EINVAL;
535 	if (ifindex < 0)
536 		goto out;
537 
538 	sk->sk_bound_dev_if = ifindex;
539 	if (sk->sk_prot->rehash)
540 		sk->sk_prot->rehash(sk);
541 	sk_dst_reset(sk);
542 
543 	ret = 0;
544 
545 out:
546 #endif
547 
548 	return ret;
549 }
550 
551 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
552 				int optlen)
553 {
554 	int ret = -ENOPROTOOPT;
555 #ifdef CONFIG_NETDEVICES
556 	struct net *net = sock_net(sk);
557 	char devname[IFNAMSIZ];
558 	int index;
559 
560 	ret = -EINVAL;
561 	if (optlen < 0)
562 		goto out;
563 
564 	/* Bind this socket to a particular device like "eth0",
565 	 * as specified in the passed interface name. If the
566 	 * name is "" or the option length is zero the socket
567 	 * name is "" or the option length is zero, the socket
568 	 */
569 	if (optlen > IFNAMSIZ - 1)
570 		optlen = IFNAMSIZ - 1;
571 	memset(devname, 0, sizeof(devname));
572 
573 	ret = -EFAULT;
574 	if (copy_from_user(devname, optval, optlen))
575 		goto out;
576 
577 	index = 0;
578 	if (devname[0] != '\0') {
579 		struct net_device *dev;
580 
581 		rcu_read_lock();
582 		dev = dev_get_by_name_rcu(net, devname);
583 		if (dev)
584 			index = dev->ifindex;
585 		rcu_read_unlock();
586 		ret = -ENODEV;
587 		if (!dev)
588 			goto out;
589 	}
590 
591 	lock_sock(sk);
592 	ret = sock_setbindtodevice_locked(sk, index);
593 	release_sock(sk);
594 
595 out:
596 #endif
597 
598 	return ret;
599 }
600 
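/*
 * Userspace view (minimal sketch): binding to a device such as "eth0"
 * requires CAP_NET_RAW in the socket's network namespace; passing an empty
 * name or a zero option length removes the binding again.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0"));
 *	...
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */
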
601 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
602 				int __user *optlen, int len)
603 {
604 	int ret = -ENOPROTOOPT;
605 #ifdef CONFIG_NETDEVICES
606 	struct net *net = sock_net(sk);
607 	char devname[IFNAMSIZ];
608 
609 	if (sk->sk_bound_dev_if == 0) {
610 		len = 0;
611 		goto zero;
612 	}
613 
614 	ret = -EINVAL;
615 	if (len < IFNAMSIZ)
616 		goto out;
617 
618 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
619 	if (ret)
620 		goto out;
621 
622 	len = strlen(devname) + 1;
623 
624 	ret = -EFAULT;
625 	if (copy_to_user(optval, devname, len))
626 		goto out;
627 
628 zero:
629 	ret = -EFAULT;
630 	if (put_user(len, optlen))
631 		goto out;
632 
633 	ret = 0;
634 
635 out:
636 #endif
637 
638 	return ret;
639 }
640 
641 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
642 {
643 	if (valbool)
644 		sock_set_flag(sk, bit);
645 	else
646 		sock_reset_flag(sk, bit);
647 }
648 
649 bool sk_mc_loop(struct sock *sk)
650 {
651 	if (dev_recursion_level())
652 		return false;
653 	if (!sk)
654 		return true;
655 	switch (sk->sk_family) {
656 	case AF_INET:
657 		return inet_sk(sk)->mc_loop;
658 #if IS_ENABLED(CONFIG_IPV6)
659 	case AF_INET6:
660 		return inet6_sk(sk)->mc_loop;
661 #endif
662 	}
663 	WARN_ON(1);
664 	return true;
665 }
666 EXPORT_SYMBOL(sk_mc_loop);
667 
668 /*
669  *	This is meant for all protocols to use and covers goings-on
670  *	at the socket level. Everything here is generic.
671  */
672 
673 int sock_setsockopt(struct socket *sock, int level, int optname,
674 		    char __user *optval, unsigned int optlen)
675 {
676 	struct sock_txtime sk_txtime;
677 	struct sock *sk = sock->sk;
678 	int val;
679 	int valbool;
680 	struct linger ling;
681 	int ret = 0;
682 
683 	/*
684 	 *	Options without arguments
685 	 */
686 
687 	if (optname == SO_BINDTODEVICE)
688 		return sock_setbindtodevice(sk, optval, optlen);
689 
690 	if (optlen < sizeof(int))
691 		return -EINVAL;
692 
693 	if (get_user(val, (int __user *)optval))
694 		return -EFAULT;
695 
696 	valbool = val ? 1 : 0;
697 
698 	lock_sock(sk);
699 
700 	switch (optname) {
701 	case SO_DEBUG:
702 		if (val && !capable(CAP_NET_ADMIN))
703 			ret = -EACCES;
704 		else
705 			sock_valbool_flag(sk, SOCK_DBG, valbool);
706 		break;
707 	case SO_REUSEADDR:
708 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
709 		break;
710 	case SO_REUSEPORT:
711 		sk->sk_reuseport = valbool;
712 		break;
713 	case SO_TYPE:
714 	case SO_PROTOCOL:
715 	case SO_DOMAIN:
716 	case SO_ERROR:
717 		ret = -ENOPROTOOPT;
718 		break;
719 	case SO_DONTROUTE:
720 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
721 		sk_dst_reset(sk);
722 		break;
723 	case SO_BROADCAST:
724 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
725 		break;
726 	case SO_SNDBUF:
727 		/* Don't error on this; BSD doesn't, and if you think
728 		 * about it this is right. Otherwise apps have to
729 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
730 		 * are treated in BSD as hints.
731 		 */
732 		val = min_t(u32, val, sysctl_wmem_max);
733 set_sndbuf:
734 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
735 		sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
736 		/* Wake up sending tasks if we upped the value. */
737 		sk->sk_write_space(sk);
738 		break;
739 
740 	case SO_SNDBUFFORCE:
741 		if (!capable(CAP_NET_ADMIN)) {
742 			ret = -EPERM;
743 			break;
744 		}
745 		goto set_sndbuf;
746 
747 	case SO_RCVBUF:
748 		/* Don't error on this; BSD doesn't, and if you think
749 		 * about it this is right. Otherwise apps have to
750 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
751 		 * are treated in BSD as hints.
752 		 */
753 		val = min_t(u32, val, sysctl_rmem_max);
754 set_rcvbuf:
755 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
756 		/*
757 		 * We double it on the way in to account for
758 		 * "struct sk_buff" etc. overhead.   Applications
759 		 * assume that the SO_RCVBUF setting they make will
760 		 * allow that much actual data to be received on that
761 		 * socket.
762 		 *
763 		 * Applications are unaware that "struct sk_buff" and
764 		 * other overheads allocate from the receive buffer
765 		 * during socket buffer allocation.
766 		 *
767 		 * And after considering the possible alternatives,
768 		 * returning the value we actually used in getsockopt
769 		 * is the most desirable behavior.
770 		 */
771 		sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
772 		break;
773 
774 	case SO_RCVBUFFORCE:
775 		if (!capable(CAP_NET_ADMIN)) {
776 			ret = -EPERM;
777 			break;
778 		}
779 		goto set_rcvbuf;
780 
781 	case SO_KEEPALIVE:
782 		if (sk->sk_prot->keepalive)
783 			sk->sk_prot->keepalive(sk, valbool);
784 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
785 		break;
786 
787 	case SO_OOBINLINE:
788 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
789 		break;
790 
791 	case SO_NO_CHECK:
792 		sk->sk_no_check_tx = valbool;
793 		break;
794 
795 	case SO_PRIORITY:
796 		if ((val >= 0 && val <= 6) ||
797 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
798 			sk->sk_priority = val;
799 		else
800 			ret = -EPERM;
801 		break;
802 
803 	case SO_LINGER:
804 		if (optlen < sizeof(ling)) {
805 			ret = -EINVAL;	/* 1003.1g */
806 			break;
807 		}
808 		if (copy_from_user(&ling, optval, sizeof(ling))) {
809 			ret = -EFAULT;
810 			break;
811 		}
812 		if (!ling.l_onoff)
813 			sock_reset_flag(sk, SOCK_LINGER);
814 		else {
815 #if (BITS_PER_LONG == 32)
816 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
817 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
818 			else
819 #endif
820 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
821 			sock_set_flag(sk, SOCK_LINGER);
822 		}
823 		break;
824 
825 	case SO_BSDCOMPAT:
826 		sock_warn_obsolete_bsdism("setsockopt");
827 		break;
828 
829 	case SO_PASSCRED:
830 		if (valbool)
831 			set_bit(SOCK_PASSCRED, &sock->flags);
832 		else
833 			clear_bit(SOCK_PASSCRED, &sock->flags);
834 		break;
835 
836 	case SO_TIMESTAMP:
837 	case SO_TIMESTAMPNS:
838 		if (valbool)  {
839 			if (optname == SO_TIMESTAMP)
840 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
841 			else
842 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
843 			sock_set_flag(sk, SOCK_RCVTSTAMP);
844 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
845 		} else {
846 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
847 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
848 		}
849 		break;
850 
851 	case SO_TIMESTAMPING:
852 		if (val & ~SOF_TIMESTAMPING_MASK) {
853 			ret = -EINVAL;
854 			break;
855 		}
856 
857 		if (val & SOF_TIMESTAMPING_OPT_ID &&
858 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
859 			if (sk->sk_protocol == IPPROTO_TCP &&
860 			    sk->sk_type == SOCK_STREAM) {
861 				if ((1 << sk->sk_state) &
862 				    (TCPF_CLOSE | TCPF_LISTEN)) {
863 					ret = -EINVAL;
864 					break;
865 				}
866 				sk->sk_tskey = tcp_sk(sk)->snd_una;
867 			} else {
868 				sk->sk_tskey = 0;
869 			}
870 		}
871 
872 		if (val & SOF_TIMESTAMPING_OPT_STATS &&
873 		    !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
874 			ret = -EINVAL;
875 			break;
876 		}
877 
878 		sk->sk_tsflags = val;
879 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
880 			sock_enable_timestamp(sk,
881 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
882 		else
883 			sock_disable_timestamp(sk,
884 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
885 		break;
886 
887 	case SO_RCVLOWAT:
888 		if (val < 0)
889 			val = INT_MAX;
890 		if (sock->ops->set_rcvlowat)
891 			ret = sock->ops->set_rcvlowat(sk, val);
892 		else
893 			sk->sk_rcvlowat = val ? : 1;
894 		break;
895 
896 	case SO_RCVTIMEO:
897 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
898 		break;
899 
900 	case SO_SNDTIMEO:
901 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
902 		break;
903 
904 	case SO_ATTACH_FILTER:
905 		ret = -EINVAL;
906 		if (optlen == sizeof(struct sock_fprog)) {
907 			struct sock_fprog fprog;
908 
909 			ret = -EFAULT;
910 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
911 				break;
912 
913 			ret = sk_attach_filter(&fprog, sk);
914 		}
915 		break;
916 
917 	case SO_ATTACH_BPF:
918 		ret = -EINVAL;
919 		if (optlen == sizeof(u32)) {
920 			u32 ufd;
921 
922 			ret = -EFAULT;
923 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
924 				break;
925 
926 			ret = sk_attach_bpf(ufd, sk);
927 		}
928 		break;
929 
930 	case SO_ATTACH_REUSEPORT_CBPF:
931 		ret = -EINVAL;
932 		if (optlen == sizeof(struct sock_fprog)) {
933 			struct sock_fprog fprog;
934 
935 			ret = -EFAULT;
936 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
937 				break;
938 
939 			ret = sk_reuseport_attach_filter(&fprog, sk);
940 		}
941 		break;
942 
943 	case SO_ATTACH_REUSEPORT_EBPF:
944 		ret = -EINVAL;
945 		if (optlen == sizeof(u32)) {
946 			u32 ufd;
947 
948 			ret = -EFAULT;
949 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
950 				break;
951 
952 			ret = sk_reuseport_attach_bpf(ufd, sk);
953 		}
954 		break;
955 
956 	case SO_DETACH_FILTER:
957 		ret = sk_detach_filter(sk);
958 		break;
959 
960 	case SO_LOCK_FILTER:
961 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
962 			ret = -EPERM;
963 		else
964 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
965 		break;
966 
967 	case SO_PASSSEC:
968 		if (valbool)
969 			set_bit(SOCK_PASSSEC, &sock->flags);
970 		else
971 			clear_bit(SOCK_PASSSEC, &sock->flags);
972 		break;
973 	case SO_MARK:
974 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
975 			ret = -EPERM;
976 		} else if (val != sk->sk_mark) {
977 			sk->sk_mark = val;
978 			sk_dst_reset(sk);
979 		}
980 		break;
981 
982 	case SO_RXQ_OVFL:
983 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
984 		break;
985 
986 	case SO_WIFI_STATUS:
987 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
988 		break;
989 
990 	case SO_PEEK_OFF:
991 		if (sock->ops->set_peek_off)
992 			ret = sock->ops->set_peek_off(sk, val);
993 		else
994 			ret = -EOPNOTSUPP;
995 		break;
996 
997 	case SO_NOFCS:
998 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
999 		break;
1000 
1001 	case SO_SELECT_ERR_QUEUE:
1002 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1003 		break;
1004 
1005 #ifdef CONFIG_NET_RX_BUSY_POLL
1006 	case SO_BUSY_POLL:
1007 		/* allow unprivileged users to decrease the value */
1008 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1009 			ret = -EPERM;
1010 		else {
1011 			if (val < 0)
1012 				ret = -EINVAL;
1013 			else
1014 				sk->sk_ll_usec = val;
1015 		}
1016 		break;
1017 #endif
1018 
1019 	case SO_MAX_PACING_RATE:
1020 		if (val != ~0U)
1021 			cmpxchg(&sk->sk_pacing_status,
1022 				SK_PACING_NONE,
1023 				SK_PACING_NEEDED);
1024 		sk->sk_max_pacing_rate = (val == ~0U) ? ~0UL : val;
1025 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1026 					 sk->sk_max_pacing_rate);
1027 		break;
1028 
1029 	case SO_INCOMING_CPU:
1030 		sk->sk_incoming_cpu = val;
1031 		break;
1032 
1033 	case SO_CNX_ADVICE:
1034 		if (val == 1)
1035 			dst_negative_advice(sk);
1036 		break;
1037 
1038 	case SO_ZEROCOPY:
1039 		if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1040 			if (!((sk->sk_type == SOCK_STREAM &&
1041 			       sk->sk_protocol == IPPROTO_TCP) ||
1042 			      (sk->sk_type == SOCK_DGRAM &&
1043 			       sk->sk_protocol == IPPROTO_UDP)))
1044 				ret = -ENOTSUPP;
1045 		} else if (sk->sk_family != PF_RDS) {
1046 			ret = -ENOTSUPP;
1047 		}
1048 		if (!ret) {
1049 			if (val < 0 || val > 1)
1050 				ret = -EINVAL;
1051 			else
1052 				sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1053 		}
1054 		break;
1055 
1056 	case SO_TXTIME:
1057 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1058 			ret = -EPERM;
1059 		} else if (optlen != sizeof(struct sock_txtime)) {
1060 			ret = -EINVAL;
1061 		} else if (copy_from_user(&sk_txtime, optval,
1062 			   sizeof(struct sock_txtime))) {
1063 			ret = -EFAULT;
1064 		} else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1065 			ret = -EINVAL;
1066 		} else {
1067 			sock_valbool_flag(sk, SOCK_TXTIME, true);
1068 			sk->sk_clockid = sk_txtime.clockid;
1069 			sk->sk_txtime_deadline_mode =
1070 				!!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1071 			sk->sk_txtime_report_errors =
1072 				!!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1073 		}
1074 		break;
1075 
1076 	case SO_BINDTOIFINDEX:
1077 		ret = sock_setbindtodevice_locked(sk, val);
1078 		break;
1079 
1080 	default:
1081 		ret = -ENOPROTOOPT;
1082 		break;
1083 	}
1084 	release_sock(sk);
1085 	return ret;
1086 }
1087 EXPORT_SYMBOL(sock_setsockopt);
1088 
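/*
 * Userspace view of the SO_RCVBUF handling above (minimal sketch): the
 * kernel clamps the request to net.core.rmem_max, doubles it to cover
 * struct sk_buff and bookkeeping overhead, and getsockopt() reports the
 * doubled value that is actually in use.
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * eff is typically twice req, bounded by the rmem_max sysctl.
 */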
1089 
1090 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1091 			  struct ucred *ucred)
1092 {
1093 	ucred->pid = pid_vnr(pid);
1094 	ucred->uid = ucred->gid = -1;
1095 	if (cred) {
1096 		struct user_namespace *current_ns = current_user_ns();
1097 
1098 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1099 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1100 	}
1101 }
1102 
1103 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1104 {
1105 	struct user_namespace *user_ns = current_user_ns();
1106 	int i;
1107 
1108 	for (i = 0; i < src->ngroups; i++)
1109 		if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1110 			return -EFAULT;
1111 
1112 	return 0;
1113 }
1114 
1115 int sock_getsockopt(struct socket *sock, int level, int optname,
1116 		    char __user *optval, int __user *optlen)
1117 {
1118 	struct sock *sk = sock->sk;
1119 
1120 	union {
1121 		int val;
1122 		u64 val64;
1123 		struct linger ling;
1124 		struct timeval tm;
1125 		struct sock_txtime txtime;
1126 	} v;
1127 
1128 	int lv = sizeof(int);
1129 	int len;
1130 
1131 	if (get_user(len, optlen))
1132 		return -EFAULT;
1133 	if (len < 0)
1134 		return -EINVAL;
1135 
1136 	memset(&v, 0, sizeof(v));
1137 
1138 	switch (optname) {
1139 	case SO_DEBUG:
1140 		v.val = sock_flag(sk, SOCK_DBG);
1141 		break;
1142 
1143 	case SO_DONTROUTE:
1144 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1145 		break;
1146 
1147 	case SO_BROADCAST:
1148 		v.val = sock_flag(sk, SOCK_BROADCAST);
1149 		break;
1150 
1151 	case SO_SNDBUF:
1152 		v.val = sk->sk_sndbuf;
1153 		break;
1154 
1155 	case SO_RCVBUF:
1156 		v.val = sk->sk_rcvbuf;
1157 		break;
1158 
1159 	case SO_REUSEADDR:
1160 		v.val = sk->sk_reuse;
1161 		break;
1162 
1163 	case SO_REUSEPORT:
1164 		v.val = sk->sk_reuseport;
1165 		break;
1166 
1167 	case SO_KEEPALIVE:
1168 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1169 		break;
1170 
1171 	case SO_TYPE:
1172 		v.val = sk->sk_type;
1173 		break;
1174 
1175 	case SO_PROTOCOL:
1176 		v.val = sk->sk_protocol;
1177 		break;
1178 
1179 	case SO_DOMAIN:
1180 		v.val = sk->sk_family;
1181 		break;
1182 
1183 	case SO_ERROR:
1184 		v.val = -sock_error(sk);
1185 		if (v.val == 0)
1186 			v.val = xchg(&sk->sk_err_soft, 0);
1187 		break;
1188 
1189 	case SO_OOBINLINE:
1190 		v.val = sock_flag(sk, SOCK_URGINLINE);
1191 		break;
1192 
1193 	case SO_NO_CHECK:
1194 		v.val = sk->sk_no_check_tx;
1195 		break;
1196 
1197 	case SO_PRIORITY:
1198 		v.val = sk->sk_priority;
1199 		break;
1200 
1201 	case SO_LINGER:
1202 		lv		= sizeof(v.ling);
1203 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1204 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1205 		break;
1206 
1207 	case SO_BSDCOMPAT:
1208 		sock_warn_obsolete_bsdism("getsockopt");
1209 		break;
1210 
1211 	case SO_TIMESTAMP:
1212 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1213 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1214 		break;
1215 
1216 	case SO_TIMESTAMPNS:
1217 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1218 		break;
1219 
1220 	case SO_TIMESTAMPING:
1221 		v.val = sk->sk_tsflags;
1222 		break;
1223 
1224 	case SO_RCVTIMEO:
1225 		lv = sizeof(struct timeval);
1226 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1227 			v.tm.tv_sec = 0;
1228 			v.tm.tv_usec = 0;
1229 		} else {
1230 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1231 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1232 		}
1233 		break;
1234 
1235 	case SO_SNDTIMEO:
1236 		lv = sizeof(struct timeval);
1237 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1238 			v.tm.tv_sec = 0;
1239 			v.tm.tv_usec = 0;
1240 		} else {
1241 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1242 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1243 		}
1244 		break;
1245 
1246 	case SO_RCVLOWAT:
1247 		v.val = sk->sk_rcvlowat;
1248 		break;
1249 
1250 	case SO_SNDLOWAT:
1251 		v.val = 1;
1252 		break;
1253 
1254 	case SO_PASSCRED:
1255 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1256 		break;
1257 
1258 	case SO_PEERCRED:
1259 	{
1260 		struct ucred peercred;
1261 		if (len > sizeof(peercred))
1262 			len = sizeof(peercred);
1263 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1264 		if (copy_to_user(optval, &peercred, len))
1265 			return -EFAULT;
1266 		goto lenout;
1267 	}
1268 
1269 	case SO_PEERGROUPS:
1270 	{
1271 		int ret, n;
1272 
1273 		if (!sk->sk_peer_cred)
1274 			return -ENODATA;
1275 
1276 		n = sk->sk_peer_cred->group_info->ngroups;
1277 		if (len < n * sizeof(gid_t)) {
1278 			len = n * sizeof(gid_t);
1279 			return put_user(len, optlen) ? -EFAULT : -ERANGE;
1280 		}
1281 		len = n * sizeof(gid_t);
1282 
1283 		ret = groups_to_user((gid_t __user *)optval,
1284 				     sk->sk_peer_cred->group_info);
1285 		if (ret)
1286 			return ret;
1287 		goto lenout;
1288 	}
1289 
1290 	case SO_PEERNAME:
1291 	{
1292 		char address[128];
1293 
1294 		lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1295 		if (lv < 0)
1296 			return -ENOTCONN;
1297 		if (lv < len)
1298 			return -EINVAL;
1299 		if (copy_to_user(optval, address, len))
1300 			return -EFAULT;
1301 		goto lenout;
1302 	}
1303 
1304 	/* Dubious BSD thing... Probably nobody even uses it, but
1305 	 * the UNIX standard wants it for whatever reason... -DaveM
1306 	 */
1307 	case SO_ACCEPTCONN:
1308 		v.val = sk->sk_state == TCP_LISTEN;
1309 		break;
1310 
1311 	case SO_PASSSEC:
1312 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1313 		break;
1314 
1315 	case SO_PEERSEC:
1316 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1317 
1318 	case SO_MARK:
1319 		v.val = sk->sk_mark;
1320 		break;
1321 
1322 	case SO_RXQ_OVFL:
1323 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1324 		break;
1325 
1326 	case SO_WIFI_STATUS:
1327 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1328 		break;
1329 
1330 	case SO_PEEK_OFF:
1331 		if (!sock->ops->set_peek_off)
1332 			return -EOPNOTSUPP;
1333 
1334 		v.val = sk->sk_peek_off;
1335 		break;
1336 	case SO_NOFCS:
1337 		v.val = sock_flag(sk, SOCK_NOFCS);
1338 		break;
1339 
1340 	case SO_BINDTODEVICE:
1341 		return sock_getbindtodevice(sk, optval, optlen, len);
1342 
1343 	case SO_GET_FILTER:
1344 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1345 		if (len < 0)
1346 			return len;
1347 
1348 		goto lenout;
1349 
1350 	case SO_LOCK_FILTER:
1351 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1352 		break;
1353 
1354 	case SO_BPF_EXTENSIONS:
1355 		v.val = bpf_tell_extensions();
1356 		break;
1357 
1358 	case SO_SELECT_ERR_QUEUE:
1359 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1360 		break;
1361 
1362 #ifdef CONFIG_NET_RX_BUSY_POLL
1363 	case SO_BUSY_POLL:
1364 		v.val = sk->sk_ll_usec;
1365 		break;
1366 #endif
1367 
1368 	case SO_MAX_PACING_RATE:
1369 		/* 32bit version */
1370 		v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1371 		break;
1372 
1373 	case SO_INCOMING_CPU:
1374 		v.val = sk->sk_incoming_cpu;
1375 		break;
1376 
1377 	case SO_MEMINFO:
1378 	{
1379 		u32 meminfo[SK_MEMINFO_VARS];
1380 
1381 		if (get_user(len, optlen))
1382 			return -EFAULT;
1383 
1384 		sk_get_meminfo(sk, meminfo);
1385 
1386 		len = min_t(unsigned int, len, sizeof(meminfo));
1387 		if (copy_to_user(optval, &meminfo, len))
1388 			return -EFAULT;
1389 
1390 		goto lenout;
1391 	}
1392 
1393 #ifdef CONFIG_NET_RX_BUSY_POLL
1394 	case SO_INCOMING_NAPI_ID:
1395 		v.val = READ_ONCE(sk->sk_napi_id);
1396 
1397 		/* aggregate non-NAPI IDs down to 0 */
1398 		if (v.val < MIN_NAPI_ID)
1399 			v.val = 0;
1400 
1401 		break;
1402 #endif
1403 
1404 	case SO_COOKIE:
1405 		lv = sizeof(u64);
1406 		if (len < lv)
1407 			return -EINVAL;
1408 		v.val64 = sock_gen_cookie(sk);
1409 		break;
1410 
1411 	case SO_ZEROCOPY:
1412 		v.val = sock_flag(sk, SOCK_ZEROCOPY);
1413 		break;
1414 
1415 	case SO_TXTIME:
1416 		lv = sizeof(v.txtime);
1417 		v.txtime.clockid = sk->sk_clockid;
1418 		v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1419 				  SOF_TXTIME_DEADLINE_MODE : 0;
1420 		v.txtime.flags |= sk->sk_txtime_report_errors ?
1421 				  SOF_TXTIME_REPORT_ERRORS : 0;
1422 		break;
1423 
1424 	case SO_BINDTOIFINDEX:
1425 		v.val = sk->sk_bound_dev_if;
1426 		break;
1427 
1428 	default:
1429 		/* We implement SO_SNDLOWAT etc. to not be settable
1430 		 * (1003.1g 7).
1431 		 */
1432 		return -ENOPROTOOPT;
1433 	}
1434 
1435 	if (len > lv)
1436 		len = lv;
1437 	if (copy_to_user(optval, &v, len))
1438 		return -EFAULT;
1439 lenout:
1440 	if (put_user(len, optlen))
1441 		return -EFAULT;
1442 	return 0;
1443 }
1444 
1445 /*
1446  * Initialize an sk_lock.
1447  *
1448  * (We also register the sk_lock with the lock validator.)
1449  */
1450 static inline void sock_lock_init(struct sock *sk)
1451 {
1452 	if (sk->sk_kern_sock)
1453 		sock_lock_init_class_and_name(
1454 			sk,
1455 			af_family_kern_slock_key_strings[sk->sk_family],
1456 			af_family_kern_slock_keys + sk->sk_family,
1457 			af_family_kern_key_strings[sk->sk_family],
1458 			af_family_kern_keys + sk->sk_family);
1459 	else
1460 		sock_lock_init_class_and_name(
1461 			sk,
1462 			af_family_slock_key_strings[sk->sk_family],
1463 			af_family_slock_keys + sk->sk_family,
1464 			af_family_key_strings[sk->sk_family],
1465 			af_family_keys + sk->sk_family);
1466 }
1467 
1468 /*
1469  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1470  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1471  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1472  */
1473 static void sock_copy(struct sock *nsk, const struct sock *osk)
1474 {
1475 #ifdef CONFIG_SECURITY_NETWORK
1476 	void *sptr = nsk->sk_security;
1477 #endif
1478 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1479 
1480 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1481 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1482 
1483 #ifdef CONFIG_SECURITY_NETWORK
1484 	nsk->sk_security = sptr;
1485 	security_sk_clone(osk, nsk);
1486 #endif
1487 }
1488 
1489 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1490 		int family)
1491 {
1492 	struct sock *sk;
1493 	struct kmem_cache *slab;
1494 
1495 	slab = prot->slab;
1496 	if (slab != NULL) {
1497 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1498 		if (!sk)
1499 			return sk;
1500 		if (priority & __GFP_ZERO)
1501 			sk_prot_clear_nulls(sk, prot->obj_size);
1502 	} else
1503 		sk = kmalloc(prot->obj_size, priority);
1504 
1505 	if (sk != NULL) {
1506 		if (security_sk_alloc(sk, family, priority))
1507 			goto out_free;
1508 
1509 		if (!try_module_get(prot->owner))
1510 			goto out_free_sec;
1511 		sk_tx_queue_clear(sk);
1512 	}
1513 
1514 	return sk;
1515 
1516 out_free_sec:
1517 	security_sk_free(sk);
1518 out_free:
1519 	if (slab != NULL)
1520 		kmem_cache_free(slab, sk);
1521 	else
1522 		kfree(sk);
1523 	return NULL;
1524 }
1525 
1526 static void sk_prot_free(struct proto *prot, struct sock *sk)
1527 {
1528 	struct kmem_cache *slab;
1529 	struct module *owner;
1530 
1531 	owner = prot->owner;
1532 	slab = prot->slab;
1533 
1534 	cgroup_sk_free(&sk->sk_cgrp_data);
1535 	mem_cgroup_sk_free(sk);
1536 	security_sk_free(sk);
1537 	if (slab != NULL)
1538 		kmem_cache_free(slab, sk);
1539 	else
1540 		kfree(sk);
1541 	module_put(owner);
1542 }
1543 
1544 /**
1545  *	sk_alloc - All socket objects are allocated here
1546  *	@net: the applicable net namespace
1547  *	@family: protocol family
1548  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1549  *	@prot: struct proto associated with this new sock instance
1550  *	@kern: is this to be a kernel socket?
1551  */
1552 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1553 		      struct proto *prot, int kern)
1554 {
1555 	struct sock *sk;
1556 
1557 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1558 	if (sk) {
1559 		sk->sk_family = family;
1560 		/*
1561 		 * See comment in struct sock definition to understand
1562 		 * why we need sk_prot_creator -acme
1563 		 */
1564 		sk->sk_prot = sk->sk_prot_creator = prot;
1565 		sk->sk_kern_sock = kern;
1566 		sock_lock_init(sk);
1567 		sk->sk_net_refcnt = kern ? 0 : 1;
1568 		if (likely(sk->sk_net_refcnt)) {
1569 			get_net(net);
1570 			sock_inuse_add(net, 1);
1571 		}
1572 
1573 		sock_net_set(sk, net);
1574 		refcount_set(&sk->sk_wmem_alloc, 1);
1575 
1576 		mem_cgroup_sk_alloc(sk);
1577 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1578 		sock_update_classid(&sk->sk_cgrp_data);
1579 		sock_update_netprioidx(&sk->sk_cgrp_data);
1580 	}
1581 
1582 	return sk;
1583 }
1584 EXPORT_SYMBOL(sk_alloc);
1585 
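/*
 * Usage sketch (example_proto and PF_EXAMPLE are illustrative assumptions):
 * an address family's create() hook typically allocates the sock with
 * GFP_KERNEL, then initialises the generic fields and ties it to the
 * struct socket.
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto, kern);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 */
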
1586 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1587  * grace period. This is the case for UDP sockets and TCP listeners.
1588  */
1589 static void __sk_destruct(struct rcu_head *head)
1590 {
1591 	struct sock *sk = container_of(head, struct sock, sk_rcu);
1592 	struct sk_filter *filter;
1593 
1594 	if (sk->sk_destruct)
1595 		sk->sk_destruct(sk);
1596 
1597 	filter = rcu_dereference_check(sk->sk_filter,
1598 				       refcount_read(&sk->sk_wmem_alloc) == 0);
1599 	if (filter) {
1600 		sk_filter_uncharge(sk, filter);
1601 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1602 	}
1603 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1604 		reuseport_detach_sock(sk);
1605 
1606 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1607 
1608 	if (atomic_read(&sk->sk_omem_alloc))
1609 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1610 			 __func__, atomic_read(&sk->sk_omem_alloc));
1611 
1612 	if (sk->sk_frag.page) {
1613 		put_page(sk->sk_frag.page);
1614 		sk->sk_frag.page = NULL;
1615 	}
1616 
1617 	if (sk->sk_peer_cred)
1618 		put_cred(sk->sk_peer_cred);
1619 	put_pid(sk->sk_peer_pid);
1620 	if (likely(sk->sk_net_refcnt))
1621 		put_net(sock_net(sk));
1622 	sk_prot_free(sk->sk_prot_creator, sk);
1623 }
1624 
1625 void sk_destruct(struct sock *sk)
1626 {
1627 	if (sock_flag(sk, SOCK_RCU_FREE))
1628 		call_rcu(&sk->sk_rcu, __sk_destruct);
1629 	else
1630 		__sk_destruct(&sk->sk_rcu);
1631 }
1632 
1633 static void __sk_free(struct sock *sk)
1634 {
1635 	if (likely(sk->sk_net_refcnt))
1636 		sock_inuse_add(sock_net(sk), -1);
1637 
1638 	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1639 		sock_diag_broadcast_destroy(sk);
1640 	else
1641 		sk_destruct(sk);
1642 }
1643 
1644 void sk_free(struct sock *sk)
1645 {
1646 	/*
1647 	 * We subtract one from sk_wmem_alloc so we can tell whether
1648 	 * some packets are still in some tx queue.
1649 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1650 	 */
1651 	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1652 		__sk_free(sk);
1653 }
1654 EXPORT_SYMBOL(sk_free);
1655 
1656 static void sk_init_common(struct sock *sk)
1657 {
1658 	skb_queue_head_init(&sk->sk_receive_queue);
1659 	skb_queue_head_init(&sk->sk_write_queue);
1660 	skb_queue_head_init(&sk->sk_error_queue);
1661 
1662 	rwlock_init(&sk->sk_callback_lock);
1663 	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1664 			af_rlock_keys + sk->sk_family,
1665 			af_family_rlock_key_strings[sk->sk_family]);
1666 	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1667 			af_wlock_keys + sk->sk_family,
1668 			af_family_wlock_key_strings[sk->sk_family]);
1669 	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1670 			af_elock_keys + sk->sk_family,
1671 			af_family_elock_key_strings[sk->sk_family]);
1672 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1673 			af_callback_keys + sk->sk_family,
1674 			af_family_clock_key_strings[sk->sk_family]);
1675 }
1676 
1677 /**
1678  *	sk_clone_lock - clone a socket, and lock its clone
1679  *	@sk: the socket to clone
1680  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1681  *
1682  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1683  */
1684 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1685 {
1686 	struct sock *newsk;
1687 	bool is_charged = true;
1688 
1689 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1690 	if (newsk != NULL) {
1691 		struct sk_filter *filter;
1692 
1693 		sock_copy(newsk, sk);
1694 
1695 		newsk->sk_prot_creator = sk->sk_prot;
1696 
1697 		/* SANITY */
1698 		if (likely(newsk->sk_net_refcnt))
1699 			get_net(sock_net(newsk));
1700 		sk_node_init(&newsk->sk_node);
1701 		sock_lock_init(newsk);
1702 		bh_lock_sock(newsk);
1703 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1704 		newsk->sk_backlog.len = 0;
1705 
1706 		atomic_set(&newsk->sk_rmem_alloc, 0);
1707 		/*
1708 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1709 		 */
1710 		refcount_set(&newsk->sk_wmem_alloc, 1);
1711 		atomic_set(&newsk->sk_omem_alloc, 0);
1712 		sk_init_common(newsk);
1713 
1714 		newsk->sk_dst_cache	= NULL;
1715 		newsk->sk_dst_pending_confirm = 0;
1716 		newsk->sk_wmem_queued	= 0;
1717 		newsk->sk_forward_alloc = 0;
1718 		atomic_set(&newsk->sk_drops, 0);
1719 		newsk->sk_send_head	= NULL;
1720 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1721 		atomic_set(&newsk->sk_zckey, 0);
1722 
1723 		sock_reset_flag(newsk, SOCK_DONE);
1724 		mem_cgroup_sk_alloc(newsk);
1725 		cgroup_sk_alloc(&newsk->sk_cgrp_data);
1726 
1727 		rcu_read_lock();
1728 		filter = rcu_dereference(sk->sk_filter);
1729 		if (filter != NULL)
1730 			/* though it's an empty new sock, the charging may fail
1731 			 * if sysctl_optmem_max was changed between creation of
1732 			 * original socket and cloning
1733 			 */
1734 			is_charged = sk_filter_charge(newsk, filter);
1735 		RCU_INIT_POINTER(newsk->sk_filter, filter);
1736 		rcu_read_unlock();
1737 
1738 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1739 			/* We need to make sure that we don't uncharge the new
1740 			 * socket if we couldn't charge it in the first place
1741 			 * as otherwise we uncharge the parent's filter.
1742 			 */
1743 			if (!is_charged)
1744 				RCU_INIT_POINTER(newsk->sk_filter, NULL);
1745 			sk_free_unlock_clone(newsk);
1746 			newsk = NULL;
1747 			goto out;
1748 		}
1749 		RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1750 
1751 		newsk->sk_err	   = 0;
1752 		newsk->sk_err_soft = 0;
1753 		newsk->sk_priority = 0;
1754 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1755 		atomic64_set(&newsk->sk_cookie, 0);
1756 		if (likely(newsk->sk_net_refcnt))
1757 			sock_inuse_add(sock_net(newsk), 1);
1758 
1759 		/*
1760 		 * Before updating sk_refcnt, we must commit prior changes to memory
1761 		 * (Documentation/RCU/rculist_nulls.txt for details)
1762 		 */
1763 		smp_wmb();
1764 		refcount_set(&newsk->sk_refcnt, 2);
1765 
1766 		/*
1767 		 * Increment the counter in the same struct proto as the master
1768 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1769 		 * is the same as sk->sk_prot->socks, as this field was copied
1770 		 * with memcpy).
1771 		 *
1772 		 * This _changes_ the previous behaviour, where
1773 		 * tcp_create_openreq_child was always incrementing the
1774 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1775 		 * to be taken into account in all callers. -acme
1776 		 */
1777 		sk_refcnt_debug_inc(newsk);
1778 		sk_set_socket(newsk, NULL);
1779 		newsk->sk_wq = NULL;
1780 
1781 		if (newsk->sk_prot->sockets_allocated)
1782 			sk_sockets_allocated_inc(newsk);
1783 
1784 		if (sock_needs_netstamp(sk) &&
1785 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1786 			net_enable_timestamp();
1787 	}
1788 out:
1789 	return newsk;
1790 }
1791 EXPORT_SYMBOL_GPL(sk_clone_lock);
1792 
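/*
 * Usage sketch (the setup step is an illustrative assumption): the clone
 * comes back locked with bh_lock_sock(), so the caller must release it even
 * when its own setup fails.
 *
 *	newsk = sk_clone_lock(sk, GFP_ATOMIC);
 *	if (newsk) {
 *		... protocol specific setup ...
 *		bh_unlock_sock(newsk);
 *	}
 */
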
1793 void sk_free_unlock_clone(struct sock *sk)
1794 {
1795 	/* It is still a raw copy of the parent, so invalidate
1796 	 * the destructor and do a plain sk_free() */
1797 	sk->sk_destruct = NULL;
1798 	bh_unlock_sock(sk);
1799 	sk_free(sk);
1800 }
1801 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1802 
1803 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1804 {
1805 	u32 max_segs = 1;
1806 
1807 	sk_dst_set(sk, dst);
1808 	sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1809 	if (sk->sk_route_caps & NETIF_F_GSO)
1810 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1811 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1812 	if (sk_can_gso(sk)) {
1813 		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1814 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1815 		} else {
1816 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1817 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1818 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1819 		}
1820 	}
1821 	sk->sk_gso_max_segs = max_segs;
1822 }
1823 EXPORT_SYMBOL_GPL(sk_setup_caps);
1824 
1825 /*
1826  *	Simple resource managers for sockets.
1827  */
1828 
1829 
1830 /*
1831  * Write buffer destructor automatically called from kfree_skb.
1832  */
1833 void sock_wfree(struct sk_buff *skb)
1834 {
1835 	struct sock *sk = skb->sk;
1836 	unsigned int len = skb->truesize;
1837 
1838 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1839 		/*
1840 		 * Keep a reference on sk_wmem_alloc; it will be released
1841 		 * after the sk_write_space() call
1842 		 */
1843 		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1844 		sk->sk_write_space(sk);
1845 		len = 1;
1846 	}
1847 	/*
1848 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1849 	 * could not do because of in-flight packets
1850 	 */
1851 	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1852 		__sk_free(sk);
1853 }
1854 EXPORT_SYMBOL(sock_wfree);
1855 
1856 /* This variant of sock_wfree() is used by TCP,
1857  * since it sets SOCK_USE_WRITE_QUEUE.
1858  */
1859 void __sock_wfree(struct sk_buff *skb)
1860 {
1861 	struct sock *sk = skb->sk;
1862 
1863 	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1864 		__sk_free(sk);
1865 }
1866 
1867 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1868 {
1869 	skb_orphan(skb);
1870 	skb->sk = sk;
1871 #ifdef CONFIG_INET
1872 	if (unlikely(!sk_fullsock(sk))) {
1873 		skb->destructor = sock_edemux;
1874 		sock_hold(sk);
1875 		return;
1876 	}
1877 #endif
1878 	skb->destructor = sock_wfree;
1879 	skb_set_hash_from_sk(skb, sk);
1880 	/*
1881 	 * We used to take a refcount on sk, but the following operation
1882 	 * is enough to guarantee sk_free() won't free this sock until
1883 	 * all in-flight packets are completed
1884 	 */
1885 	refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1886 }
1887 EXPORT_SYMBOL(skb_set_owner_w);
1888 
1889 /* This helper is used by netem, as it can hold packets in its
1890  * delay queue. We want to allow the owner socket to send more
1891  * packets, as if they were already TX completed by a typical driver.
1892  * But we also want to keep skb->sk set because some packet schedulers
1893  * rely on it (sch_fq for example).
1894  */
1895 void skb_orphan_partial(struct sk_buff *skb)
1896 {
1897 	if (skb_is_tcp_pure_ack(skb))
1898 		return;
1899 
1900 	if (skb->destructor == sock_wfree
1901 #ifdef CONFIG_INET
1902 	    || skb->destructor == tcp_wfree
1903 #endif
1904 		) {
1905 		struct sock *sk = skb->sk;
1906 
1907 		if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1908 			WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1909 			skb->destructor = sock_efree;
1910 		}
1911 	} else {
1912 		skb_orphan(skb);
1913 	}
1914 }
1915 EXPORT_SYMBOL(skb_orphan_partial);
1916 
1917 /*
1918  * Read buffer destructor automatically called from kfree_skb.
1919  */
1920 void sock_rfree(struct sk_buff *skb)
1921 {
1922 	struct sock *sk = skb->sk;
1923 	unsigned int len = skb->truesize;
1924 
1925 	atomic_sub(len, &sk->sk_rmem_alloc);
1926 	sk_mem_uncharge(sk, len);
1927 }
1928 EXPORT_SYMBOL(sock_rfree);
1929 
1930 /*
1931  * Buffer destructor for skbs that are not used directly in read or write
1932  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1933  */
1934 void sock_efree(struct sk_buff *skb)
1935 {
1936 	sock_put(skb->sk);
1937 }
1938 EXPORT_SYMBOL(sock_efree);
1939 
1940 kuid_t sock_i_uid(struct sock *sk)
1941 {
1942 	kuid_t uid;
1943 
1944 	read_lock_bh(&sk->sk_callback_lock);
1945 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1946 	read_unlock_bh(&sk->sk_callback_lock);
1947 	return uid;
1948 }
1949 EXPORT_SYMBOL(sock_i_uid);
1950 
1951 unsigned long sock_i_ino(struct sock *sk)
1952 {
1953 	unsigned long ino;
1954 
1955 	read_lock_bh(&sk->sk_callback_lock);
1956 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1957 	read_unlock_bh(&sk->sk_callback_lock);
1958 	return ino;
1959 }
1960 EXPORT_SYMBOL(sock_i_ino);
1961 
1962 /*
1963  * Allocate a skb from the socket's send buffer.
1964  */
1965 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1966 			     gfp_t priority)
1967 {
1968 	if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1969 		struct sk_buff *skb = alloc_skb(size, priority);
1970 		if (skb) {
1971 			skb_set_owner_w(skb, sk);
1972 			return skb;
1973 		}
1974 	}
1975 	return NULL;
1976 }
1977 EXPORT_SYMBOL(sock_wmalloc);
1978 
1979 static void sock_ofree(struct sk_buff *skb)
1980 {
1981 	struct sock *sk = skb->sk;
1982 
1983 	atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1984 }
1985 
1986 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1987 			     gfp_t priority)
1988 {
1989 	struct sk_buff *skb;
1990 
1991 	/* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1992 	if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1993 	    sysctl_optmem_max)
1994 		return NULL;
1995 
1996 	skb = alloc_skb(size, priority);
1997 	if (!skb)
1998 		return NULL;
1999 
2000 	atomic_add(skb->truesize, &sk->sk_omem_alloc);
2001 	skb->sk = sk;
2002 	skb->destructor = sock_ofree;
2003 	return skb;
2004 }
2005 
2006 /*
2007  * Allocate a memory block from the socket's option memory buffer.
2008  */
2009 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2010 {
2011 	if ((unsigned int)size <= sysctl_optmem_max &&
2012 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2013 		void *mem;
2014 		/* First do the add, to avoid the race if kmalloc
2015 		 * might sleep.
2016 		 */
2017 		atomic_add(size, &sk->sk_omem_alloc);
2018 		mem = kmalloc(size, priority);
2019 		if (mem)
2020 			return mem;
2021 		atomic_sub(size, &sk->sk_omem_alloc);
2022 	}
2023 	return NULL;
2024 }
2025 EXPORT_SYMBOL(sock_kmalloc);
2026 
2027 /* Free an option memory block. Note, we actually want the inline
2028  * here as this allows gcc to detect the nullify and fold away the
2029  * condition entirely.
2030  */
2031 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2032 				  const bool nullify)
2033 {
2034 	if (WARN_ON_ONCE(!mem))
2035 		return;
2036 	if (nullify)
2037 		kzfree(mem);
2038 	else
2039 		kfree(mem);
2040 	atomic_sub(size, &sk->sk_omem_alloc);
2041 }
2042 
2043 void sock_kfree_s(struct sock *sk, void *mem, int size)
2044 {
2045 	__sock_kfree_s(sk, mem, size, false);
2046 }
2047 EXPORT_SYMBOL(sock_kfree_s);
2048 
2049 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2050 {
2051 	__sock_kfree_s(sk, mem, size, true);
2052 }
2053 EXPORT_SYMBOL(sock_kzfree_s);
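
/* Illustrative sketch only: a hypothetical option-copy helper
 * (example_copy_opt is not an in-tree function).  Memory obtained with
 * sock_kmalloc() is charged to sk_omem_alloc and must be returned with
 * sock_kfree_s() (or sock_kzfree_s() for sensitive data) using the same size.
 */
static int example_copy_opt(struct sock *sk, const void __user *src, int len)
{
	void *buf;
	int err = 0;

	buf = sock_kmalloc(sk, len, GFP_KERNEL);
	if (!buf)
		return -ENOBUFS;

	if (copy_from_user(buf, src, len))
		err = -EFAULT;

	/* every exit path must undo the sk_omem_alloc charge */
	sock_kfree_s(sk, buf, len);
	return err;
}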
2054 
2055 /* This is almost wait_for_tcp_memory() minus release_sock()/lock_sock().
2056    These locks should probably be removed for datagram sockets.
2057  */
2058 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2059 {
2060 	DEFINE_WAIT(wait);
2061 
2062 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2063 	for (;;) {
2064 		if (!timeo)
2065 			break;
2066 		if (signal_pending(current))
2067 			break;
2068 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2069 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2070 		if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2071 			break;
2072 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2073 			break;
2074 		if (sk->sk_err)
2075 			break;
2076 		timeo = schedule_timeout(timeo);
2077 	}
2078 	finish_wait(sk_sleep(sk), &wait);
2079 	return timeo;
2080 }
2081 
2082 
2083 /*
2084  *	Generic send/receive buffer handlers
2085  */
2086 
2087 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2088 				     unsigned long data_len, int noblock,
2089 				     int *errcode, int max_page_order)
2090 {
2091 	struct sk_buff *skb;
2092 	long timeo;
2093 	int err;
2094 
2095 	timeo = sock_sndtimeo(sk, noblock);
2096 	for (;;) {
2097 		err = sock_error(sk);
2098 		if (err != 0)
2099 			goto failure;
2100 
2101 		err = -EPIPE;
2102 		if (sk->sk_shutdown & SEND_SHUTDOWN)
2103 			goto failure;
2104 
2105 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2106 			break;
2107 
2108 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2109 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2110 		err = -EAGAIN;
2111 		if (!timeo)
2112 			goto failure;
2113 		if (signal_pending(current))
2114 			goto interrupted;
2115 		timeo = sock_wait_for_wmem(sk, timeo);
2116 	}
2117 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2118 				   errcode, sk->sk_allocation);
2119 	if (skb)
2120 		skb_set_owner_w(skb, sk);
2121 	return skb;
2122 
2123 interrupted:
2124 	err = sock_intr_errno(timeo);
2125 failure:
2126 	*errcode = err;
2127 	return NULL;
2128 }
2129 EXPORT_SYMBOL(sock_alloc_send_pskb);
2130 
2131 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2132 				    int noblock, int *errcode)
2133 {
2134 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2135 }
2136 EXPORT_SYMBOL(sock_alloc_send_skb);
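
/* Illustrative sketch only: a hypothetical datagram transmit path
 * (example_build_dgram is not an in-tree function).  sock_alloc_send_skb()
 * waits for send-buffer space (honouring sk_sndtimeo) and already charges
 * the skb via skb_set_owner_w(); MAX_HEADER here stands in for the
 * protocol's real header budget.
 */
static struct sk_buff *example_build_dgram(struct sock *sk, size_t len,
					   int noblock, int *err)
{
	struct sk_buff *skb;

	skb = sock_alloc_send_skb(sk, len + MAX_HEADER, noblock, err);
	if (!skb)
		return NULL;

	skb_reserve(skb, MAX_HEADER);	/* leave room for lower-layer headers */
	return skb;
}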
2137 
2138 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2139 		     struct sockcm_cookie *sockc)
2140 {
2141 	u32 tsflags;
2142 
2143 	switch (cmsg->cmsg_type) {
2144 	case SO_MARK:
2145 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2146 			return -EPERM;
2147 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2148 			return -EINVAL;
2149 		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2150 		break;
2151 	case SO_TIMESTAMPING:
2152 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2153 			return -EINVAL;
2154 
2155 		tsflags = *(u32 *)CMSG_DATA(cmsg);
2156 		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2157 			return -EINVAL;
2158 
2159 		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2160 		sockc->tsflags |= tsflags;
2161 		break;
2162 	case SCM_TXTIME:
2163 		if (!sock_flag(sk, SOCK_TXTIME))
2164 			return -EINVAL;
2165 		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2166 			return -EINVAL;
2167 		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2168 		break;
2169 	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2170 	case SCM_RIGHTS:
2171 	case SCM_CREDENTIALS:
2172 		break;
2173 	default:
2174 		return -EINVAL;
2175 	}
2176 	return 0;
2177 }
2178 EXPORT_SYMBOL(__sock_cmsg_send);
2179 
2180 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2181 		   struct sockcm_cookie *sockc)
2182 {
2183 	struct cmsghdr *cmsg;
2184 	int ret;
2185 
2186 	for_each_cmsghdr(cmsg, msg) {
2187 		if (!CMSG_OK(msg, cmsg))
2188 			return -EINVAL;
2189 		if (cmsg->cmsg_level != SOL_SOCKET)
2190 			continue;
2191 		ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2192 		if (ret)
2193 			return ret;
2194 	}
2195 	return 0;
2196 }
2197 EXPORT_SYMBOL(sock_cmsg_send);
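
/* Illustrative sketch only: a hypothetical sendmsg() prologue
 * (example_init_cookie is not an in-tree function).  The cookie starts from
 * the socket's stored tsflags; sock_cmsg_send() then applies any SOL_SOCKET
 * overrides (SO_MARK, SO_TIMESTAMPING, SCM_TXTIME) found in the control data.
 */
static int example_init_cookie(struct sock *sk, struct msghdr *msg,
			       struct sockcm_cookie *sockc)
{
	*sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags };

	if (msg->msg_controllen)
		return sock_cmsg_send(sk, msg, sockc);
	return 0;
}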
2198 
2199 static void sk_enter_memory_pressure(struct sock *sk)
2200 {
2201 	if (!sk->sk_prot->enter_memory_pressure)
2202 		return;
2203 
2204 	sk->sk_prot->enter_memory_pressure(sk);
2205 }
2206 
2207 static void sk_leave_memory_pressure(struct sock *sk)
2208 {
2209 	if (sk->sk_prot->leave_memory_pressure) {
2210 		sk->sk_prot->leave_memory_pressure(sk);
2211 	} else {
2212 		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2213 
2214 		if (memory_pressure && *memory_pressure)
2215 			*memory_pressure = 0;
2216 	}
2217 }
2218 
2219 /* On 32bit arches, an skb frag is limited to 2^15 */
2220 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
2221 
2222 /**
2223  * skb_page_frag_refill - check that a page_frag contains enough room
2224  * @sz: minimum size of the fragment we want to get
2225  * @pfrag: pointer to page_frag
2226  * @gfp: priority for memory allocation
2227  *
2228  * Note: While this allocator tries to use high-order pages, there is
2229  * no guarantee that allocations succeed. Therefore, @sz MUST be
2230  * less than or equal to PAGE_SIZE.
2231  */
2232 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2233 {
2234 	if (pfrag->page) {
2235 		if (page_ref_count(pfrag->page) == 1) {
2236 			pfrag->offset = 0;
2237 			return true;
2238 		}
2239 		if (pfrag->offset + sz <= pfrag->size)
2240 			return true;
2241 		put_page(pfrag->page);
2242 	}
2243 
2244 	pfrag->offset = 0;
2245 	if (SKB_FRAG_PAGE_ORDER) {
2246 		/* Avoid direct reclaim but allow kswapd to wake */
2247 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2248 					  __GFP_COMP | __GFP_NOWARN |
2249 					  __GFP_NORETRY,
2250 					  SKB_FRAG_PAGE_ORDER);
2251 		if (likely(pfrag->page)) {
2252 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2253 			return true;
2254 		}
2255 	}
2256 	pfrag->page = alloc_page(gfp);
2257 	if (likely(pfrag->page)) {
2258 		pfrag->size = PAGE_SIZE;
2259 		return true;
2260 	}
2261 	return false;
2262 }
2263 EXPORT_SYMBOL(skb_page_frag_refill);
2264 
2265 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2266 {
2267 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2268 		return true;
2269 
2270 	sk_enter_memory_pressure(sk);
2271 	sk_stream_moderate_sndbuf(sk);
2272 	return false;
2273 }
2274 EXPORT_SYMBOL(sk_page_frag_refill);
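
/* Illustrative sketch only: a hypothetical copy into the per-socket page
 * fragment (example_append is not an in-tree function; attaching the page
 * to an skb frag and taking page references are omitted).
 */
static int example_append(struct sock *sk, struct iov_iter *from, int copy)
{
	struct page_frag *pfrag = sk_page_frag(sk);

	if (!sk_page_frag_refill(sk, pfrag))
		return -ENOMEM;

	copy = min_t(int, copy, pfrag->size - pfrag->offset);
	if (copy_page_from_iter(pfrag->page, pfrag->offset, copy, from) != copy)
		return -EFAULT;

	pfrag->offset += copy;
	return copy;
}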
2275 
2276 static void __lock_sock(struct sock *sk)
2277 	__releases(&sk->sk_lock.slock)
2278 	__acquires(&sk->sk_lock.slock)
2279 {
2280 	DEFINE_WAIT(wait);
2281 
2282 	for (;;) {
2283 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2284 					TASK_UNINTERRUPTIBLE);
2285 		spin_unlock_bh(&sk->sk_lock.slock);
2286 		schedule();
2287 		spin_lock_bh(&sk->sk_lock.slock);
2288 		if (!sock_owned_by_user(sk))
2289 			break;
2290 	}
2291 	finish_wait(&sk->sk_lock.wq, &wait);
2292 }
2293 
2294 void __release_sock(struct sock *sk)
2295 	__releases(&sk->sk_lock.slock)
2296 	__acquires(&sk->sk_lock.slock)
2297 {
2298 	struct sk_buff *skb, *next;
2299 
2300 	while ((skb = sk->sk_backlog.head) != NULL) {
2301 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2302 
2303 		spin_unlock_bh(&sk->sk_lock.slock);
2304 
2305 		do {
2306 			next = skb->next;
2307 			prefetch(next);
2308 			WARN_ON_ONCE(skb_dst_is_noref(skb));
2309 			skb_mark_not_on_list(skb);
2310 			sk_backlog_rcv(sk, skb);
2311 
2312 			cond_resched();
2313 
2314 			skb = next;
2315 		} while (skb != NULL);
2316 
2317 		spin_lock_bh(&sk->sk_lock.slock);
2318 	}
2319 
2320 	/*
2321 	 * Doing the zeroing here guarantees we cannot loop forever
2322 	 * while a wild producer attempts to flood us.
2323 	 */
2324 	sk->sk_backlog.len = 0;
2325 }
2326 
2327 void __sk_flush_backlog(struct sock *sk)
2328 {
2329 	spin_lock_bh(&sk->sk_lock.slock);
2330 	__release_sock(sk);
2331 	spin_unlock_bh(&sk->sk_lock.slock);
2332 }
2333 
2334 /**
2335  * sk_wait_data - wait for data to arrive at sk_receive_queue
2336  * @sk:    sock to wait on
2337  * @timeo: for how long
2338  * @skb:   last skb seen on sk_receive_queue
2339  *
2340  * Now socket state, including sk->sk_err, is changed only under the lock;
2341  * hence we may omit checks after joining the wait queue.
2342  * We check the receive queue before schedule() only as an optimization;
2343  * it is very likely that release_sock() added new data.
2344  */
2345 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2346 {
2347 	DEFINE_WAIT_FUNC(wait, woken_wake_function);
2348 	int rc;
2349 
2350 	add_wait_queue(sk_sleep(sk), &wait);
2351 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2352 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2353 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2354 	remove_wait_queue(sk_sleep(sk), &wait);
2355 	return rc;
2356 }
2357 EXPORT_SYMBOL(sk_wait_data);
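
/* Illustrative sketch only: a hypothetical blocking receive loop under
 * lock_sock() (example_wait_for_data is not an in-tree function).
 * sk_wait_data() drops the socket lock while sleeping and re-takes it, so
 * the receive queue and error state are re-tested on every iteration.
 */
static struct sk_buff *example_wait_for_data(struct sock *sk, long *timeo,
					     int *err)
{
	struct sk_buff *skb;

	*err = 0;
	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
		if (sk->sk_err) {
			*err = sock_error(sk);
			return NULL;
		}
		if (sk->sk_shutdown & RCV_SHUTDOWN)
			return NULL;			/* EOF */
		if (!*timeo) {
			*err = -EAGAIN;
			return NULL;
		}
		if (signal_pending(current)) {
			*err = sock_intr_errno(*timeo);
			return NULL;
		}
		sk_wait_data(sk, timeo, NULL);
	}
	return skb;
}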
2358 
2359 /**
2360  *	__sk_mem_raise_allocated - increase memory_allocated
2361  *	@sk: socket
2362  *	@size: memory size to allocate
2363  *	@amt: pages to allocate
2364  *	@kind: allocation type
2365  *
2366  *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2367  */
2368 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2369 {
2370 	struct proto *prot = sk->sk_prot;
2371 	long allocated = sk_memory_allocated_add(sk, amt);
2372 	bool charged = true;
2373 
2374 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2375 	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
2376 		goto suppress_allocation;
2377 
2378 	/* Under limit. */
2379 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2380 		sk_leave_memory_pressure(sk);
2381 		return 1;
2382 	}
2383 
2384 	/* Under pressure. */
2385 	if (allocated > sk_prot_mem_limits(sk, 1))
2386 		sk_enter_memory_pressure(sk);
2387 
2388 	/* Over hard limit. */
2389 	if (allocated > sk_prot_mem_limits(sk, 2))
2390 		goto suppress_allocation;
2391 
2392 	/* guarantee minimum buffer size under pressure */
2393 	if (kind == SK_MEM_RECV) {
2394 		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2395 			return 1;
2396 
2397 	} else { /* SK_MEM_SEND */
2398 		int wmem0 = sk_get_wmem0(sk, prot);
2399 
2400 		if (sk->sk_type == SOCK_STREAM) {
2401 			if (sk->sk_wmem_queued < wmem0)
2402 				return 1;
2403 		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2404 			return 1;
2405 		}
2406 	}
2407 
2408 	if (sk_has_memory_pressure(sk)) {
2409 		int alloc;
2410 
2411 		if (!sk_under_memory_pressure(sk))
2412 			return 1;
2413 		alloc = sk_sockets_allocated_read_positive(sk);
2414 		if (sk_prot_mem_limits(sk, 2) > alloc *
2415 		    sk_mem_pages(sk->sk_wmem_queued +
2416 				 atomic_read(&sk->sk_rmem_alloc) +
2417 				 sk->sk_forward_alloc))
2418 			return 1;
2419 	}
2420 
2421 suppress_allocation:
2422 
2423 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2424 		sk_stream_moderate_sndbuf(sk);
2425 
2426 		/* Fail only if socket is _under_ its sndbuf.
2427 		 * In this case we cannot block, so we have to fail.
2428 		 */
2429 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2430 			return 1;
2431 	}
2432 
2433 	if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2434 		trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2435 
2436 	sk_memory_allocated_sub(sk, amt);
2437 
2438 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2439 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2440 
2441 	return 0;
2442 }
2443 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2444 
2445 /**
2446  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2447  *	@sk: socket
2448  *	@size: memory size to allocate
2449  *	@kind: allocation type
2450  *
2451  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2452  *	rmem allocation. This function assumes that protocols which have
2453  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2454  */
2455 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2456 {
2457 	int ret, amt = sk_mem_pages(size);
2458 
2459 	sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2460 	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2461 	if (!ret)
2462 		sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2463 	return ret;
2464 }
2465 EXPORT_SYMBOL(__sk_mem_schedule);
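
/* Illustrative sketch only: a hypothetical receive-side charge
 * (example_queue_rcv is not an in-tree function).  sk_rmem_schedule() falls
 * back to __sk_mem_schedule() when sk_forward_alloc is too small, and
 * skb_set_owner_r() accounts the skb so sock_rfree() undoes both charges.
 */
static int example_queue_rcv(struct sock *sk, struct sk_buff *skb)
{
	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned int)sk->sk_rcvbuf)
		return -ENOMEM;

	if (!sk_rmem_schedule(sk, skb, skb->truesize))
		return -ENOBUFS;

	skb_set_owner_r(skb, sk);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk);
	return 0;
}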
2466 
2467 /**
2468  *	__sk_mem_reduce_allocated - reclaim memory_allocated
2469  *	@sk: socket
2470  *	@amount: number of quanta
2471  *
2472  *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2473  */
2474 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2475 {
2476 	sk_memory_allocated_sub(sk, amount);
2477 
2478 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2479 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2480 
2481 	if (sk_under_memory_pressure(sk) &&
2482 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2483 		sk_leave_memory_pressure(sk);
2484 }
2485 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2486 
2487 /**
2488  *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2489  *	@sk: socket
2490  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2491  */
2492 void __sk_mem_reclaim(struct sock *sk, int amount)
2493 {
2494 	amount >>= SK_MEM_QUANTUM_SHIFT;
2495 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2496 	__sk_mem_reduce_allocated(sk, amount);
2497 }
2498 EXPORT_SYMBOL(__sk_mem_reclaim);
2499 
2500 int sk_set_peek_off(struct sock *sk, int val)
2501 {
2502 	sk->sk_peek_off = val;
2503 	return 0;
2504 }
2505 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2506 
2507 /*
2508  * Set of default routines for initialising struct proto_ops when
2509  * the protocol does not support a particular function. In certain
2510  * cases where it makes no sense for a protocol to have a "do nothing"
2511  * function, some default processing is provided.
2512  */
2513 
2514 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2515 {
2516 	return -EOPNOTSUPP;
2517 }
2518 EXPORT_SYMBOL(sock_no_bind);
2519 
2520 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2521 		    int len, int flags)
2522 {
2523 	return -EOPNOTSUPP;
2524 }
2525 EXPORT_SYMBOL(sock_no_connect);
2526 
2527 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2528 {
2529 	return -EOPNOTSUPP;
2530 }
2531 EXPORT_SYMBOL(sock_no_socketpair);
2532 
2533 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
2534 		   bool kern)
2535 {
2536 	return -EOPNOTSUPP;
2537 }
2538 EXPORT_SYMBOL(sock_no_accept);
2539 
2540 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2541 		    int peer)
2542 {
2543 	return -EOPNOTSUPP;
2544 }
2545 EXPORT_SYMBOL(sock_no_getname);
2546 
2547 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2548 {
2549 	return -EOPNOTSUPP;
2550 }
2551 EXPORT_SYMBOL(sock_no_ioctl);
2552 
2553 int sock_no_listen(struct socket *sock, int backlog)
2554 {
2555 	return -EOPNOTSUPP;
2556 }
2557 EXPORT_SYMBOL(sock_no_listen);
2558 
2559 int sock_no_shutdown(struct socket *sock, int how)
2560 {
2561 	return -EOPNOTSUPP;
2562 }
2563 EXPORT_SYMBOL(sock_no_shutdown);
2564 
2565 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2566 		    char __user *optval, unsigned int optlen)
2567 {
2568 	return -EOPNOTSUPP;
2569 }
2570 EXPORT_SYMBOL(sock_no_setsockopt);
2571 
2572 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2573 		    char __user *optval, int __user *optlen)
2574 {
2575 	return -EOPNOTSUPP;
2576 }
2577 EXPORT_SYMBOL(sock_no_getsockopt);
2578 
2579 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2580 {
2581 	return -EOPNOTSUPP;
2582 }
2583 EXPORT_SYMBOL(sock_no_sendmsg);
2584 
2585 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
2586 {
2587 	return -EOPNOTSUPP;
2588 }
2589 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2590 
2591 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2592 		    int flags)
2593 {
2594 	return -EOPNOTSUPP;
2595 }
2596 EXPORT_SYMBOL(sock_no_recvmsg);
2597 
2598 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2599 {
2600 	/* Mirror missing mmap method error code */
2601 	return -ENODEV;
2602 }
2603 EXPORT_SYMBOL(sock_no_mmap);
2604 
2605 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2606 {
2607 	ssize_t res;
2608 	struct msghdr msg = {.msg_flags = flags};
2609 	struct kvec iov;
2610 	char *kaddr = kmap(page);
2611 	iov.iov_base = kaddr + offset;
2612 	iov.iov_len = size;
2613 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2614 	kunmap(page);
2615 	return res;
2616 }
2617 EXPORT_SYMBOL(sock_no_sendpage);
2618 
2619 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2620 				int offset, size_t size, int flags)
2621 {
2622 	ssize_t res;
2623 	struct msghdr msg = {.msg_flags = flags};
2624 	struct kvec iov;
2625 	char *kaddr = kmap(page);
2626 
2627 	iov.iov_base = kaddr + offset;
2628 	iov.iov_len = size;
2629 	res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2630 	kunmap(page);
2631 	return res;
2632 }
2633 EXPORT_SYMBOL(sock_no_sendpage_locked);
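
/* Illustrative sketch only: a hypothetical proto_ops table
 * (example_dgram_ops is not an in-tree structure).  Every operation the
 * protocol does not support is wired to the sock_no_*() stubs above so
 * userspace consistently gets -EOPNOTSUPP (or -ENODEV for mmap).
 */
static const struct proto_ops example_dgram_ops __maybe_unused = {
	.family		= PF_UNSPEC,	/* placeholder family */
	.owner		= THIS_MODULE,
	/* .release, .poll, .sendmsg, .recvmsg are protocol specific (omitted) */
	.bind		= sock_no_bind,
	.connect	= sock_no_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= sock_no_accept,
	.getname	= sock_no_getname,
	.ioctl		= sock_no_ioctl,
	.listen		= sock_no_listen,
	.shutdown	= sock_no_shutdown,
	.setsockopt	= sock_no_setsockopt,
	.getsockopt	= sock_no_getsockopt,
	.mmap		= sock_no_mmap,
	.sendpage	= sock_no_sendpage,
};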
2634 
2635 /*
2636  *	Default Socket Callbacks
2637  */
2638 
2639 static void sock_def_wakeup(struct sock *sk)
2640 {
2641 	struct socket_wq *wq;
2642 
2643 	rcu_read_lock();
2644 	wq = rcu_dereference(sk->sk_wq);
2645 	if (skwq_has_sleeper(wq))
2646 		wake_up_interruptible_all(&wq->wait);
2647 	rcu_read_unlock();
2648 }
2649 
2650 static void sock_def_error_report(struct sock *sk)
2651 {
2652 	struct socket_wq *wq;
2653 
2654 	rcu_read_lock();
2655 	wq = rcu_dereference(sk->sk_wq);
2656 	if (skwq_has_sleeper(wq))
2657 		wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2658 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2659 	rcu_read_unlock();
2660 }
2661 
2662 static void sock_def_readable(struct sock *sk)
2663 {
2664 	struct socket_wq *wq;
2665 
2666 	rcu_read_lock();
2667 	wq = rcu_dereference(sk->sk_wq);
2668 	if (skwq_has_sleeper(wq))
2669 		wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2670 						EPOLLRDNORM | EPOLLRDBAND);
2671 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2672 	rcu_read_unlock();
2673 }
2674 
2675 static void sock_def_write_space(struct sock *sk)
2676 {
2677 	struct socket_wq *wq;
2678 
2679 	rcu_read_lock();
2680 
2681 	/* Do not wake up a writer until he can make "significant"
2682 	 * progress.  --DaveM
2683 	 */
2684 	if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2685 		wq = rcu_dereference(sk->sk_wq);
2686 		if (skwq_has_sleeper(wq))
2687 			wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2688 						EPOLLWRNORM | EPOLLWRBAND);
2689 
2690 		/* Should agree with poll, otherwise some programs break */
2691 		if (sock_writeable(sk))
2692 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2693 	}
2694 
2695 	rcu_read_unlock();
2696 }
2697 
2698 static void sock_def_destruct(struct sock *sk)
2699 {
2700 }
2701 
2702 void sk_send_sigurg(struct sock *sk)
2703 {
2704 	if (sk->sk_socket && sk->sk_socket->file)
2705 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2706 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2707 }
2708 EXPORT_SYMBOL(sk_send_sigurg);
2709 
2710 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2711 		    unsigned long expires)
2712 {
2713 	if (!mod_timer(timer, expires))
2714 		sock_hold(sk);
2715 }
2716 EXPORT_SYMBOL(sk_reset_timer);
2717 
2718 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2719 {
2720 	if (del_timer(timer))
2721 		__sock_put(sk);
2722 }
2723 EXPORT_SYMBOL(sk_stop_timer);
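
/* Illustrative sketch only: a hypothetical protocol timer handler
 * (example_timer_handler is not an in-tree function).  sk_reset_timer()
 * takes a socket reference only when it actually (re)arms the timer and
 * sk_stop_timer() drops one only when it removes a pending timer, so the
 * handler itself must end with sock_put().
 */
static void example_timer_handler(struct timer_list *t)
{
	struct sock *sk = from_timer(sk, t, sk_timer);

	bh_lock_sock(sk);
	/* ... protocol timer work, possibly sk_reset_timer() again ... */
	bh_unlock_sock(sk);
	sock_put(sk);		/* balances the hold taken by sk_reset_timer() */
}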
2724 
2725 void sock_init_data(struct socket *sock, struct sock *sk)
2726 {
2727 	sk_init_common(sk);
2728 	sk->sk_send_head	=	NULL;
2729 
2730 	timer_setup(&sk->sk_timer, NULL, 0);
2731 
2732 	sk->sk_allocation	=	GFP_KERNEL;
2733 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2734 	sk->sk_sndbuf		=	sysctl_wmem_default;
2735 	sk->sk_state		=	TCP_CLOSE;
2736 	sk_set_socket(sk, sock);
2737 
2738 	sock_set_flag(sk, SOCK_ZAPPED);
2739 
2740 	if (sock) {
2741 		sk->sk_type	=	sock->type;
2742 		sk->sk_wq	=	sock->wq;
2743 		sock->sk	=	sk;
2744 		sk->sk_uid	=	SOCK_INODE(sock)->i_uid;
2745 	} else {
2746 		sk->sk_wq	=	NULL;
2747 		sk->sk_uid	=	make_kuid(sock_net(sk)->user_ns, 0);
2748 	}
2749 
2750 	rwlock_init(&sk->sk_callback_lock);
2751 	if (sk->sk_kern_sock)
2752 		lockdep_set_class_and_name(
2753 			&sk->sk_callback_lock,
2754 			af_kern_callback_keys + sk->sk_family,
2755 			af_family_kern_clock_key_strings[sk->sk_family]);
2756 	else
2757 		lockdep_set_class_and_name(
2758 			&sk->sk_callback_lock,
2759 			af_callback_keys + sk->sk_family,
2760 			af_family_clock_key_strings[sk->sk_family]);
2761 
2762 	sk->sk_state_change	=	sock_def_wakeup;
2763 	sk->sk_data_ready	=	sock_def_readable;
2764 	sk->sk_write_space	=	sock_def_write_space;
2765 	sk->sk_error_report	=	sock_def_error_report;
2766 	sk->sk_destruct		=	sock_def_destruct;
2767 
2768 	sk->sk_frag.page	=	NULL;
2769 	sk->sk_frag.offset	=	0;
2770 	sk->sk_peek_off		=	-1;
2771 
2772 	sk->sk_peer_pid 	=	NULL;
2773 	sk->sk_peer_cred	=	NULL;
2774 	sk->sk_write_pending	=	0;
2775 	sk->sk_rcvlowat		=	1;
2776 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2777 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2778 
2779 	sk->sk_stamp = SK_DEFAULT_STAMP;
2780 #if BITS_PER_LONG==32
2781 	seqlock_init(&sk->sk_stamp_seq);
2782 #endif
2783 	atomic_set(&sk->sk_zckey, 0);
2784 
2785 #ifdef CONFIG_NET_RX_BUSY_POLL
2786 	sk->sk_napi_id		=	0;
2787 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2788 #endif
2789 
2790 	sk->sk_max_pacing_rate = ~0UL;
2791 	sk->sk_pacing_rate = ~0UL;
2792 	sk->sk_pacing_shift = 10;
2793 	sk->sk_incoming_cpu = -1;
2794 
2795 	sk_rx_queue_clear(sk);
2796 	/*
2797 	 * Before updating sk_refcnt, we must commit prior changes to memory
2798 	 * (Documentation/RCU/rculist_nulls.txt for details)
2799 	 */
2800 	smp_wmb();
2801 	refcount_set(&sk->sk_refcnt, 1);
2802 	atomic_set(&sk->sk_drops, 0);
2803 }
2804 EXPORT_SYMBOL(sock_init_data);
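
/* Illustrative sketch only: a hypothetical ->create() handler
 * (example_create is not an in-tree function; PF_UNSPEC is a placeholder).
 * Protocols typically allocate the sock with sk_alloc() against their
 * struct proto and then call sock_init_data() to install the defaults
 * set up above before doing protocol-specific initialisation.
 */
static int example_create(struct net *net, struct socket *sock,
			  struct proto *prot, int kern)
{
	struct sock *sk;

	sk = sk_alloc(net, PF_UNSPEC, GFP_KERNEL, prot, kern);
	if (!sk)
		return -ENOBUFS;

	sock_init_data(sock, sk);
	/* protocol-specific fields would be initialised here */
	return 0;
}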
2805 
2806 void lock_sock_nested(struct sock *sk, int subclass)
2807 {
2808 	might_sleep();
2809 	spin_lock_bh(&sk->sk_lock.slock);
2810 	if (sk->sk_lock.owned)
2811 		__lock_sock(sk);
2812 	sk->sk_lock.owned = 1;
2813 	spin_unlock(&sk->sk_lock.slock);
2814 	/*
2815 	 * The sk_lock has mutex_lock() semantics here:
2816 	 */
2817 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2818 	local_bh_enable();
2819 }
2820 EXPORT_SYMBOL(lock_sock_nested);
2821 
2822 void release_sock(struct sock *sk)
2823 {
2824 	spin_lock_bh(&sk->sk_lock.slock);
2825 	if (sk->sk_backlog.tail)
2826 		__release_sock(sk);
2827 
2828 	/* Warning: release_cb() might need to release sk ownership,
2829 	 * i.e. call sock_release_ownership(sk) before us.
2830 	 */
2831 	if (sk->sk_prot->release_cb)
2832 		sk->sk_prot->release_cb(sk);
2833 
2834 	sock_release_ownership(sk);
2835 	if (waitqueue_active(&sk->sk_lock.wq))
2836 		wake_up(&sk->sk_lock.wq);
2837 	spin_unlock_bh(&sk->sk_lock.slock);
2838 }
2839 EXPORT_SYMBOL(release_sock);
2840 
2841 /**
2842  * lock_sock_fast - fast version of lock_sock
2843  * @sk: socket
2844  *
2845  * This version should be used for very small sections, where the process
2846  * won't block.  It returns false if the fast path is taken:
2847  *
2848  *   sk_lock.slock locked, owned = 0, BH disabled
2849  *
2850  * It returns true if the slow path is taken:
2851  *
2852  *   sk_lock.slock unlocked, owned = 1, BH enabled
2853  */
2854 bool lock_sock_fast(struct sock *sk)
2855 {
2856 	might_sleep();
2857 	spin_lock_bh(&sk->sk_lock.slock);
2858 
2859 	if (!sk->sk_lock.owned)
2860 		/*
2861 		 * Note: we return with BH disabled and sk_lock.slock held.
2862 		 */
2863 		return false;
2864 
2865 	__lock_sock(sk);
2866 	sk->sk_lock.owned = 1;
2867 	spin_unlock(&sk->sk_lock.slock);
2868 	/*
2869 	 * The sk_lock has mutex_lock() semantics here:
2870 	 */
2871 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2872 	local_bh_enable();
2873 	return true;
2874 }
2875 EXPORT_SYMBOL(lock_sock_fast);
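
/* Illustrative sketch only: a hypothetical reader of socket state
 * (example_read_rmem is not an in-tree function).  lock_sock_fast() pairs
 * with unlock_sock_fast(), which must be told whether the slow path was
 * taken so it can either drop the spinlock and re-enable BHs (fast) or do
 * a full release_sock() (slow).
 */
static unsigned int example_read_rmem(struct sock *sk)
{
	bool slow = lock_sock_fast(sk);
	unsigned int rmem = atomic_read(&sk->sk_rmem_alloc);

	unlock_sock_fast(sk, slow);
	return rmem;
}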
2876 
2877 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2878 {
2879 	struct timeval tv;
2880 
2881 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2882 	tv = ktime_to_timeval(sock_read_timestamp(sk));
2883 	if (tv.tv_sec == -1)
2884 		return -ENOENT;
2885 	if (tv.tv_sec == 0) {
2886 		ktime_t kt = ktime_get_real();
2887 		sock_write_timestamp(sk, kt);
2888 		tv = ktime_to_timeval(kt);
2889 	}
2890 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2891 }
2892 EXPORT_SYMBOL(sock_get_timestamp);
2893 
2894 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2895 {
2896 	struct timespec ts;
2897 
2898 	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2899 	ts = ktime_to_timespec(sock_read_timestamp(sk));
2900 	if (ts.tv_sec == -1)
2901 		return -ENOENT;
2902 	if (ts.tv_sec == 0) {
2903 		ktime_t kt = ktime_get_real();
2904 		sock_write_timestamp(sk, kt);
2905 		ts = ktime_to_timespec(kt);
2906 	}
2907 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2908 }
2909 EXPORT_SYMBOL(sock_get_timestampns);
2910 
2911 void sock_enable_timestamp(struct sock *sk, int flag)
2912 {
2913 	if (!sock_flag(sk, flag)) {
2914 		unsigned long previous_flags = sk->sk_flags;
2915 
2916 		sock_set_flag(sk, flag);
2917 		/*
2918 		 * We just set one of the two flags which require net
2919 		 * timestamping, but timestamping might already have been
2920 		 * enabled because of the other one.
2921 		 */
2922 		if (sock_needs_netstamp(sk) &&
2923 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2924 			net_enable_timestamp();
2925 	}
2926 }
2927 
2928 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2929 		       int level, int type)
2930 {
2931 	struct sock_exterr_skb *serr;
2932 	struct sk_buff *skb;
2933 	int copied, err;
2934 
2935 	err = -EAGAIN;
2936 	skb = sock_dequeue_err_skb(sk);
2937 	if (skb == NULL)
2938 		goto out;
2939 
2940 	copied = skb->len;
2941 	if (copied > len) {
2942 		msg->msg_flags |= MSG_TRUNC;
2943 		copied = len;
2944 	}
2945 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2946 	if (err)
2947 		goto out_free_skb;
2948 
2949 	sock_recv_timestamp(msg, sk, skb);
2950 
2951 	serr = SKB_EXT_ERR(skb);
2952 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2953 
2954 	msg->msg_flags |= MSG_ERRQUEUE;
2955 	err = copied;
2956 
2957 out_free_skb:
2958 	kfree_skb(skb);
2959 out:
2960 	return err;
2961 }
2962 EXPORT_SYMBOL(sock_recv_errqueue);
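
/* Illustrative sketch only: a hypothetical recvmsg() fragment
 * (example_recv_err is not an in-tree function).  Protocols service
 * MSG_ERRQUEUE reads by delegating to sock_recv_errqueue(), passing the
 * cmsg level/type the extended error should be reported under (af_packet,
 * for instance, uses SOL_PACKET/PACKET_TX_TIMESTAMP).
 */
static int example_recv_err(struct socket *sock, struct msghdr *msg,
			    size_t len, int flags)
{
	struct sock *sk = sock->sk;

	if (flags & MSG_ERRQUEUE)
		return sock_recv_errqueue(sk, msg, len, SOL_PACKET,
					  PACKET_TX_TIMESTAMP);
	return -EOPNOTSUPP;	/* the normal receive path is omitted here */
}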
2963 
2964 /*
2965  *	Get a socket option on a socket.
2966  *
2967  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2968  *	asynchronous errors should be reported by getsockopt. We assume
2969  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2970  */
2971 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2972 			   char __user *optval, int __user *optlen)
2973 {
2974 	struct sock *sk = sock->sk;
2975 
2976 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2977 }
2978 EXPORT_SYMBOL(sock_common_getsockopt);
2979 
2980 #ifdef CONFIG_COMPAT
2981 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2982 				  char __user *optval, int __user *optlen)
2983 {
2984 	struct sock *sk = sock->sk;
2985 
2986 	if (sk->sk_prot->compat_getsockopt != NULL)
2987 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2988 						      optval, optlen);
2989 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2990 }
2991 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2992 #endif
2993 
2994 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2995 			int flags)
2996 {
2997 	struct sock *sk = sock->sk;
2998 	int addr_len = 0;
2999 	int err;
3000 
3001 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3002 				   flags & ~MSG_DONTWAIT, &addr_len);
3003 	if (err >= 0)
3004 		msg->msg_namelen = addr_len;
3005 	return err;
3006 }
3007 EXPORT_SYMBOL(sock_common_recvmsg);
3008 
3009 /*
3010  *	Set socket options on an inet socket.
3011  */
3012 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3013 			   char __user *optval, unsigned int optlen)
3014 {
3015 	struct sock *sk = sock->sk;
3016 
3017 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3018 }
3019 EXPORT_SYMBOL(sock_common_setsockopt);
3020 
3021 #ifdef CONFIG_COMPAT
3022 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3023 				  char __user *optval, unsigned int optlen)
3024 {
3025 	struct sock *sk = sock->sk;
3026 
3027 	if (sk->sk_prot->compat_setsockopt != NULL)
3028 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
3029 						      optval, optlen);
3030 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3031 }
3032 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3033 #endif
3034 
3035 void sk_common_release(struct sock *sk)
3036 {
3037 	if (sk->sk_prot->destroy)
3038 		sk->sk_prot->destroy(sk);
3039 
3040 	/*
3041 	 * Observation: when sk_common_release() is called, processes have
3042 	 * no access to the socket, but the network stack still does.
3043 	 * Step one, detach it from networking:
3044 	 *
3045 	 * A. Remove from hash tables.
3046 	 */
3047 
3048 	sk->sk_prot->unhash(sk);
3049 
3050 	/*
3051 	 * At this point the socket cannot receive new packets, but it is possible
3052 	 * that some packets are in flight because some CPU is still running the
3053 	 * receiver and did its hash table lookup before we unhashed the socket.
3054 	 * They will reach the receive queue and be purged by the socket destructor.
3055 	 *
3056 	 * Also, we still have packets pending on the receive queue and, probably,
3057 	 * our own packets waiting in device queues. The socket destructor will
3058 	 * drain the receive queue, but transmitted packets will delay socket
3059 	 * destruction until the last reference is released.
3060 	 */
3061 
3062 	sock_orphan(sk);
3063 
3064 	xfrm_sk_free_policy(sk);
3065 
3066 	sk_refcnt_debug_release(sk);
3067 
3068 	sock_put(sk);
3069 }
3070 EXPORT_SYMBOL(sk_common_release);
3071 
3072 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3073 {
3074 	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3075 
3076 	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3077 	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3078 	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3079 	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3080 	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3081 	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3082 	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3083 	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3084 	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3085 }
3086 
3087 #ifdef CONFIG_PROC_FS
3088 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
3089 struct prot_inuse {
3090 	int val[PROTO_INUSE_NR];
3091 };
3092 
3093 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3094 
3095 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3096 {
3097 	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3098 }
3099 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3100 
3101 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3102 {
3103 	int cpu, idx = prot->inuse_idx;
3104 	int res = 0;
3105 
3106 	for_each_possible_cpu(cpu)
3107 		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3108 
3109 	return res >= 0 ? res : 0;
3110 }
3111 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3112 
3113 static void sock_inuse_add(struct net *net, int val)
3114 {
3115 	this_cpu_add(*net->core.sock_inuse, val);
3116 }
3117 
3118 int sock_inuse_get(struct net *net)
3119 {
3120 	int cpu, res = 0;
3121 
3122 	for_each_possible_cpu(cpu)
3123 		res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3124 
3125 	return res;
3126 }
3127 
3128 EXPORT_SYMBOL_GPL(sock_inuse_get);
3129 
3130 static int __net_init sock_inuse_init_net(struct net *net)
3131 {
3132 	net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3133 	if (net->core.prot_inuse == NULL)
3134 		return -ENOMEM;
3135 
3136 	net->core.sock_inuse = alloc_percpu(int);
3137 	if (net->core.sock_inuse == NULL)
3138 		goto out;
3139 
3140 	return 0;
3141 
3142 out:
3143 	free_percpu(net->core.prot_inuse);
3144 	return -ENOMEM;
3145 }
3146 
3147 static void __net_exit sock_inuse_exit_net(struct net *net)
3148 {
3149 	free_percpu(net->core.prot_inuse);
3150 	free_percpu(net->core.sock_inuse);
3151 }
3152 
3153 static struct pernet_operations net_inuse_ops = {
3154 	.init = sock_inuse_init_net,
3155 	.exit = sock_inuse_exit_net,
3156 };
3157 
3158 static __init int net_inuse_init(void)
3159 {
3160 	if (register_pernet_subsys(&net_inuse_ops))
3161 		panic("Cannot initialize net inuse counters");
3162 
3163 	return 0;
3164 }
3165 
3166 core_initcall(net_inuse_init);
3167 
3168 static void assign_proto_idx(struct proto *prot)
3169 {
3170 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3171 
3172 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3173 		pr_err("PROTO_INUSE_NR exhausted\n");
3174 		return;
3175 	}
3176 
3177 	set_bit(prot->inuse_idx, proto_inuse_idx);
3178 }
3179 
3180 static void release_proto_idx(struct proto *prot)
3181 {
3182 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3183 		clear_bit(prot->inuse_idx, proto_inuse_idx);
3184 }
3185 #else
3186 static inline void assign_proto_idx(struct proto *prot)
3187 {
3188 }
3189 
3190 static inline void release_proto_idx(struct proto *prot)
3191 {
3192 }
3193 
3194 static void sock_inuse_add(struct net *net, int val)
3195 {
3196 }
3197 #endif
3198 
3199 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3200 {
3201 	if (!rsk_prot)
3202 		return;
3203 	kfree(rsk_prot->slab_name);
3204 	rsk_prot->slab_name = NULL;
3205 	kmem_cache_destroy(rsk_prot->slab);
3206 	rsk_prot->slab = NULL;
3207 }
3208 
3209 static int req_prot_init(const struct proto *prot)
3210 {
3211 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
3212 
3213 	if (!rsk_prot)
3214 		return 0;
3215 
3216 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3217 					prot->name);
3218 	if (!rsk_prot->slab_name)
3219 		return -ENOMEM;
3220 
3221 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3222 					   rsk_prot->obj_size, 0,
3223 					   SLAB_ACCOUNT | prot->slab_flags,
3224 					   NULL);
3225 
3226 	if (!rsk_prot->slab) {
3227 		pr_crit("%s: Can't create request sock SLAB cache!\n",
3228 			prot->name);
3229 		return -ENOMEM;
3230 	}
3231 	return 0;
3232 }
3233 
3234 int proto_register(struct proto *prot, int alloc_slab)
3235 {
3236 	if (alloc_slab) {
3237 		prot->slab = kmem_cache_create_usercopy(prot->name,
3238 					prot->obj_size, 0,
3239 					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3240 					prot->slab_flags,
3241 					prot->useroffset, prot->usersize,
3242 					NULL);
3243 
3244 		if (prot->slab == NULL) {
3245 			pr_crit("%s: Can't create sock SLAB cache!\n",
3246 				prot->name);
3247 			goto out;
3248 		}
3249 
3250 		if (req_prot_init(prot))
3251 			goto out_free_request_sock_slab;
3252 
3253 		if (prot->twsk_prot != NULL) {
3254 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3255 
3256 			if (prot->twsk_prot->twsk_slab_name == NULL)
3257 				goto out_free_request_sock_slab;
3258 
3259 			prot->twsk_prot->twsk_slab =
3260 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3261 						  prot->twsk_prot->twsk_obj_size,
3262 						  0,
3263 						  SLAB_ACCOUNT |
3264 						  prot->slab_flags,
3265 						  NULL);
3266 			if (prot->twsk_prot->twsk_slab == NULL)
3267 				goto out_free_timewait_sock_slab_name;
3268 		}
3269 	}
3270 
3271 	mutex_lock(&proto_list_mutex);
3272 	list_add(&prot->node, &proto_list);
3273 	assign_proto_idx(prot);
3274 	mutex_unlock(&proto_list_mutex);
3275 	return 0;
3276 
3277 out_free_timewait_sock_slab_name:
3278 	kfree(prot->twsk_prot->twsk_slab_name);
3279 out_free_request_sock_slab:
3280 	req_prot_cleanup(prot->rsk_prot);
3281 
3282 	kmem_cache_destroy(prot->slab);
3283 	prot->slab = NULL;
3284 out:
3285 	return -ENOBUFS;
3286 }
3287 EXPORT_SYMBOL(proto_register);
3288 
3289 void proto_unregister(struct proto *prot)
3290 {
3291 	mutex_lock(&proto_list_mutex);
3292 	release_proto_idx(prot);
3293 	list_del(&prot->node);
3294 	mutex_unlock(&proto_list_mutex);
3295 
3296 	kmem_cache_destroy(prot->slab);
3297 	prot->slab = NULL;
3298 
3299 	req_prot_cleanup(prot->rsk_prot);
3300 
3301 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3302 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3303 		kfree(prot->twsk_prot->twsk_slab_name);
3304 		prot->twsk_prot->twsk_slab = NULL;
3305 	}
3306 }
3307 EXPORT_SYMBOL(proto_unregister);
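
/* Illustrative sketch only: a hypothetical module using the registration
 * API above (example_proto and the init/exit functions are not in-tree).
 * Passing alloc_slab=1 gives the protocol its own kmem cache, and the
 * registered proto then shows up in /proc/net/protocols.
 */
static struct proto example_proto = {
	.name		= "EXAMPLE",		/* hypothetical protocol name */
	.owner		= THIS_MODULE,
	.obj_size	= sizeof(struct sock),
};

static int __init example_proto_init(void)
{
	return proto_register(&example_proto, 1);
}

static void __exit example_proto_exit(void)
{
	proto_unregister(&example_proto);
}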
3308 
3309 int sock_load_diag_module(int family, int protocol)
3310 {
3311 	if (!protocol) {
3312 		if (!sock_is_registered(family))
3313 			return -ENOENT;
3314 
3315 		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3316 				      NETLINK_SOCK_DIAG, family);
3317 	}
3318 
3319 #ifdef CONFIG_INET
3320 	if (family == AF_INET &&
3321 	    protocol != IPPROTO_RAW &&
3322 	    !rcu_access_pointer(inet_protos[protocol]))
3323 		return -ENOENT;
3324 #endif
3325 
3326 	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3327 			      NETLINK_SOCK_DIAG, family, protocol);
3328 }
3329 EXPORT_SYMBOL(sock_load_diag_module);
3330 
3331 #ifdef CONFIG_PROC_FS
3332 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3333 	__acquires(proto_list_mutex)
3334 {
3335 	mutex_lock(&proto_list_mutex);
3336 	return seq_list_start_head(&proto_list, *pos);
3337 }
3338 
3339 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3340 {
3341 	return seq_list_next(v, &proto_list, pos);
3342 }
3343 
3344 static void proto_seq_stop(struct seq_file *seq, void *v)
3345 	__releases(proto_list_mutex)
3346 {
3347 	mutex_unlock(&proto_list_mutex);
3348 }
3349 
3350 static char proto_method_implemented(const void *method)
3351 {
3352 	return method == NULL ? 'n' : 'y';
3353 }
3354 static long sock_prot_memory_allocated(struct proto *proto)
3355 {
3356 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3357 }
3358 
3359 static char *sock_prot_memory_pressure(struct proto *proto)
3360 {
3361 	return proto->memory_pressure != NULL ?
3362 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3363 }
3364 
3365 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3366 {
3367 
3368 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3369 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3370 		   proto->name,
3371 		   proto->obj_size,
3372 		   sock_prot_inuse_get(seq_file_net(seq), proto),
3373 		   sock_prot_memory_allocated(proto),
3374 		   sock_prot_memory_pressure(proto),
3375 		   proto->max_header,
3376 		   proto->slab == NULL ? "no" : "yes",
3377 		   module_name(proto->owner),
3378 		   proto_method_implemented(proto->close),
3379 		   proto_method_implemented(proto->connect),
3380 		   proto_method_implemented(proto->disconnect),
3381 		   proto_method_implemented(proto->accept),
3382 		   proto_method_implemented(proto->ioctl),
3383 		   proto_method_implemented(proto->init),
3384 		   proto_method_implemented(proto->destroy),
3385 		   proto_method_implemented(proto->shutdown),
3386 		   proto_method_implemented(proto->setsockopt),
3387 		   proto_method_implemented(proto->getsockopt),
3388 		   proto_method_implemented(proto->sendmsg),
3389 		   proto_method_implemented(proto->recvmsg),
3390 		   proto_method_implemented(proto->sendpage),
3391 		   proto_method_implemented(proto->bind),
3392 		   proto_method_implemented(proto->backlog_rcv),
3393 		   proto_method_implemented(proto->hash),
3394 		   proto_method_implemented(proto->unhash),
3395 		   proto_method_implemented(proto->get_port),
3396 		   proto_method_implemented(proto->enter_memory_pressure));
3397 }
3398 
3399 static int proto_seq_show(struct seq_file *seq, void *v)
3400 {
3401 	if (v == &proto_list)
3402 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3403 			   "protocol",
3404 			   "size",
3405 			   "sockets",
3406 			   "memory",
3407 			   "press",
3408 			   "maxhdr",
3409 			   "slab",
3410 			   "module",
3411 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3412 	else
3413 		proto_seq_printf(seq, list_entry(v, struct proto, node));
3414 	return 0;
3415 }
3416 
3417 static const struct seq_operations proto_seq_ops = {
3418 	.start  = proto_seq_start,
3419 	.next   = proto_seq_next,
3420 	.stop   = proto_seq_stop,
3421 	.show   = proto_seq_show,
3422 };
3423 
3424 static __net_init int proto_init_net(struct net *net)
3425 {
3426 	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3427 			sizeof(struct seq_net_private)))
3428 		return -ENOMEM;
3429 
3430 	return 0;
3431 }
3432 
3433 static __net_exit void proto_exit_net(struct net *net)
3434 {
3435 	remove_proc_entry("protocols", net->proc_net);
3436 }
3437 
3438 
3439 static __net_initdata struct pernet_operations proto_net_ops = {
3440 	.init = proto_init_net,
3441 	.exit = proto_exit_net,
3442 };
3443 
3444 static int __init proto_init(void)
3445 {
3446 	return register_pernet_subsys(&proto_net_ops);
3447 }
3448 
3449 subsys_initcall(proto_init);
3450 
3451 #endif /* PROC_FS */
3452 
3453 #ifdef CONFIG_NET_RX_BUSY_POLL
3454 bool sk_busy_loop_end(void *p, unsigned long start_time)
3455 {
3456 	struct sock *sk = p;
3457 
3458 	return !skb_queue_empty(&sk->sk_receive_queue) ||
3459 	       sk_busy_loop_timeout(sk, start_time);
3460 }
3461 EXPORT_SYMBOL(sk_busy_loop_end);
3462 #endif /* CONFIG_NET_RX_BUSY_POLL */
3463