xref: /linux/net/core/sock.c (revision 08ec212c0f92cbf30e3ecc7349f18151714041d6)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 #include <linux/highmem.h>
115 #include <linux/user_namespace.h>
116 #include <linux/static_key.h>
117 #include <linux/memcontrol.h>
118 #include <linux/prefetch.h>
119 
120 #include <asm/uaccess.h>
121 
122 #include <linux/netdevice.h>
123 #include <net/protocol.h>
124 #include <linux/skbuff.h>
125 #include <net/net_namespace.h>
126 #include <net/request_sock.h>
127 #include <net/sock.h>
128 #include <linux/net_tstamp.h>
129 #include <net/xfrm.h>
130 #include <linux/ipsec.h>
131 #include <net/cls_cgroup.h>
132 #include <net/netprio_cgroup.h>
133 
134 #include <linux/filter.h>
135 
136 #include <trace/events/sock.h>
137 
138 #ifdef CONFIG_INET
139 #include <net/tcp.h>
140 #endif
141 
142 static DEFINE_MUTEX(proto_list_mutex);
143 static LIST_HEAD(proto_list);
144 
145 #ifdef CONFIG_MEMCG_KMEM
146 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
147 {
148 	struct proto *proto;
149 	int ret = 0;
150 
151 	mutex_lock(&proto_list_mutex);
152 	list_for_each_entry(proto, &proto_list, node) {
153 		if (proto->init_cgroup) {
154 			ret = proto->init_cgroup(memcg, ss);
155 			if (ret)
156 				goto out;
157 		}
158 	}
159 
160 	mutex_unlock(&proto_list_mutex);
161 	return ret;
162 out:
163 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
164 		if (proto->destroy_cgroup)
165 			proto->destroy_cgroup(memcg);
166 	mutex_unlock(&proto_list_mutex);
167 	return ret;
168 }
169 
170 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
171 {
172 	struct proto *proto;
173 
174 	mutex_lock(&proto_list_mutex);
175 	list_for_each_entry_reverse(proto, &proto_list, node)
176 		if (proto->destroy_cgroup)
177 			proto->destroy_cgroup(memcg);
178 	mutex_unlock(&proto_list_mutex);
179 }
180 #endif
181 
182 /*
183  * Each address family might have different locking rules, so we have
184  * one slock key per address family:
185  */
186 static struct lock_class_key af_family_keys[AF_MAX];
187 static struct lock_class_key af_family_slock_keys[AF_MAX];
188 
189 struct static_key memcg_socket_limit_enabled;
190 EXPORT_SYMBOL(memcg_socket_limit_enabled);
191 
192 /*
193  * Make lock validator output more readable. (we pre-construct these
194  * strings build-time, so that runtime initialization of socket
195  * locks is fast):
196  */
197 static const char *const af_family_key_strings[AF_MAX+1] = {
198   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
199   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
200   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
201   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
202   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
203   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
204   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
205   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
206   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
207   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
208   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
209   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
210   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
211   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
212 };
213 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
214   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
215   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
216   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
217   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
218   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
219   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
220   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
221   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
222   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
223   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
224   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
225   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
226   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
227   "slock-AF_NFC"   , "slock-AF_MAX"
228 };
229 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
230   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
231   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
232   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
233   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
234   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
235   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
236   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
237   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
238   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
239   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
240   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
241   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
242   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
243   "clock-AF_NFC"   , "clock-AF_MAX"
244 };
245 
246 /*
247  * sk_callback_lock locking rules are per-address-family,
248  * so split the lock classes by using a per-AF key:
249  */
250 static struct lock_class_key af_callback_keys[AF_MAX];
251 
252 /* Take into consideration the size of the struct sk_buff overhead in the
253  * determination of these values, since that is non-constant across
254  * platforms.  This makes socket queueing behavior and performance
255  * not depend upon such differences.
256  */
257 #define _SK_MEM_PACKETS		256
258 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
259 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
260 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
261 
262 /* Run time adjustable parameters. */
263 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
264 EXPORT_SYMBOL(sysctl_wmem_max);
265 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
266 EXPORT_SYMBOL(sysctl_rmem_max);
267 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
268 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
269 
270 /* Maximal space eaten by iovec or ancillary data plus some space */
271 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
272 EXPORT_SYMBOL(sysctl_optmem_max);
273 
274 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
275 EXPORT_SYMBOL_GPL(memalloc_socks);
276 
277 /**
278  * sk_set_memalloc - sets %SOCK_MEMALLOC
279  * @sk: socket to set it on
280  *
281  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
282  * It's the responsibility of the admin to adjust min_free_kbytes
283  * to meet the requirements
284  */
285 void sk_set_memalloc(struct sock *sk)
286 {
287 	sock_set_flag(sk, SOCK_MEMALLOC);
288 	sk->sk_allocation |= __GFP_MEMALLOC;
289 	static_key_slow_inc(&memalloc_socks);
290 }
291 EXPORT_SYMBOL_GPL(sk_set_memalloc);
292 
293 void sk_clear_memalloc(struct sock *sk)
294 {
295 	sock_reset_flag(sk, SOCK_MEMALLOC);
296 	sk->sk_allocation &= ~__GFP_MEMALLOC;
297 	static_key_slow_dec(&memalloc_socks);
298 
299 	/*
300 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
301 	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
302 	 * it has rmem allocations there is a risk that the user of the
303 	 * socket cannot make forward progress due to exceeding the rmem
304 	 * limits. By rights, sk_clear_memalloc() should only be called
305 	 * on sockets being torn down but warn and reset the accounting if
306 	 * that assumption breaks.
307 	 */
308 	if (WARN_ON(sk->sk_forward_alloc))
309 		sk_mem_reclaim(sk);
310 }
311 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
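
/*
 * Illustrative sketch (not part of the original file): a kernel user that
 * performs I/O needed for memory reclaim, e.g. a hypothetical
 * swap-over-network backend, would typically bracket the lifetime of its
 * socket with these helpers:
 *
 *	sk_set_memalloc(sk);	(socket may now dip into emergency reserves)
 *	... transmit/receive while the system is under memory pressure ...
 *	sk_clear_memalloc(sk);	(normally only done when tearing the socket down)
 */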
312 
313 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
314 {
315 	int ret;
316 	unsigned long pflags = current->flags;
317 
318 	/* these should have been dropped before queueing */
319 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
320 
321 	current->flags |= PF_MEMALLOC;
322 	ret = sk->sk_backlog_rcv(sk, skb);
323 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
324 
325 	return ret;
326 }
327 EXPORT_SYMBOL(__sk_backlog_rcv);
328 
329 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
330 {
331 	struct timeval tv;
332 
333 	if (optlen < sizeof(tv))
334 		return -EINVAL;
335 	if (copy_from_user(&tv, optval, sizeof(tv)))
336 		return -EFAULT;
337 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
338 		return -EDOM;
339 
340 	if (tv.tv_sec < 0) {
341 		static int warned __read_mostly;
342 
343 		*timeo_p = 0;
344 		if (warned < 10 && net_ratelimit()) {
345 			warned++;
346 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
347 				__func__, current->comm, task_pid_nr(current));
348 		}
349 		return 0;
350 	}
351 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
352 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
353 		return 0;
354 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
355 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
356 	return 0;
357 }
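
/*
 * Worked example (illustrative, not part of the original file): with HZ=1000
 * the conversion above is tv_sec * 1000 + (tv_usec + 999) / 1000, so a
 * timeval of { .tv_sec = 1, .tv_usec = 500000 } becomes 1500 jiffies, and
 * even a single leftover microsecond is rounded up to one extra jiffy.
 */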
358 
359 static void sock_warn_obsolete_bsdism(const char *name)
360 {
361 	static int warned;
362 	static char warncomm[TASK_COMM_LEN];
363 	if (strcmp(warncomm, current->comm) && warned < 5) {
364 		strcpy(warncomm,  current->comm);
365 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
366 			warncomm, name);
367 		warned++;
368 	}
369 }
370 
371 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
372 
373 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
374 {
375 	if (sk->sk_flags & flags) {
376 		sk->sk_flags &= ~flags;
377 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
378 			net_disable_timestamp();
379 	}
380 }
381 
382 
383 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
384 {
385 	int err;
386 	int skb_len;
387 	unsigned long flags;
388 	struct sk_buff_head *list = &sk->sk_receive_queue;
389 
390 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
391 		atomic_inc(&sk->sk_drops);
392 		trace_sock_rcvqueue_full(sk, skb);
393 		return -ENOMEM;
394 	}
395 
396 	err = sk_filter(sk, skb);
397 	if (err)
398 		return err;
399 
400 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
401 		atomic_inc(&sk->sk_drops);
402 		return -ENOBUFS;
403 	}
404 
405 	skb->dev = NULL;
406 	skb_set_owner_r(skb, sk);
407 
408 	/* Cache the SKB length before we tack it onto the receive
409 	 * queue.  Once it is added it no longer belongs to us and
410 	 * may be freed by other threads of control pulling packets
411 	 * from the queue.
412 	 */
413 	skb_len = skb->len;
414 
415 	/* we escape from the RCU-protected region, make sure we don't leak
416 	 * a non-refcounted dst
417 	 */
418 	skb_dst_force(skb);
419 
420 	spin_lock_irqsave(&list->lock, flags);
421 	skb->dropcount = atomic_read(&sk->sk_drops);
422 	__skb_queue_tail(list, skb);
423 	spin_unlock_irqrestore(&list->lock, flags);
424 
425 	if (!sock_flag(sk, SOCK_DEAD))
426 		sk->sk_data_ready(sk, skb_len);
427 	return 0;
428 }
429 EXPORT_SYMBOL(sock_queue_rcv_skb);
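
/*
 * Illustrative sketch (not part of the original file): a datagram protocol's
 * receive path, here a hypothetical my_proto_rcv(), would typically hand the
 * skb over like this and account the drop itself on failure:
 *
 *	static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */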
430 
431 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
432 {
433 	int rc = NET_RX_SUCCESS;
434 
435 	if (sk_filter(sk, skb))
436 		goto discard_and_relse;
437 
438 	skb->dev = NULL;
439 
440 	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
441 		atomic_inc(&sk->sk_drops);
442 		goto discard_and_relse;
443 	}
444 	if (nested)
445 		bh_lock_sock_nested(sk);
446 	else
447 		bh_lock_sock(sk);
448 	if (!sock_owned_by_user(sk)) {
449 		/*
450 		 * trylock + unlock semantics:
451 		 */
452 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
453 
454 		rc = sk_backlog_rcv(sk, skb);
455 
456 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
457 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
458 		bh_unlock_sock(sk);
459 		atomic_inc(&sk->sk_drops);
460 		goto discard_and_relse;
461 	}
462 
463 	bh_unlock_sock(sk);
464 out:
465 	sock_put(sk);
466 	return rc;
467 discard_and_relse:
468 	kfree_skb(skb);
469 	goto out;
470 }
471 EXPORT_SYMBOL(sk_receive_skb);
472 
473 void sk_reset_txq(struct sock *sk)
474 {
475 	sk_tx_queue_clear(sk);
476 }
477 EXPORT_SYMBOL(sk_reset_txq);
478 
479 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
480 {
481 	struct dst_entry *dst = __sk_dst_get(sk);
482 
483 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
484 		sk_tx_queue_clear(sk);
485 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
486 		dst_release(dst);
487 		return NULL;
488 	}
489 
490 	return dst;
491 }
492 EXPORT_SYMBOL(__sk_dst_check);
493 
494 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
495 {
496 	struct dst_entry *dst = sk_dst_get(sk);
497 
498 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
499 		sk_dst_reset(sk);
500 		dst_release(dst);
501 		return NULL;
502 	}
503 
504 	return dst;
505 }
506 EXPORT_SYMBOL(sk_dst_check);
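
/*
 * Illustrative sketch (not part of the original file): a connected socket's
 * output path typically revalidates its cached route with sk_dst_check() and
 * falls back to a fresh lookup when the entry has become obsolete:
 *
 *	dst = sk_dst_check(sk, 0);
 *	if (!dst) {
 *		... perform a new route lookup ...
 *		sk_dst_set(sk, dst);
 *	}
 */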
507 
508 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
509 {
510 	int ret = -ENOPROTOOPT;
511 #ifdef CONFIG_NETDEVICES
512 	struct net *net = sock_net(sk);
513 	char devname[IFNAMSIZ];
514 	int index;
515 
516 	/* Sorry... */
517 	ret = -EPERM;
518 	if (!capable(CAP_NET_RAW))
519 		goto out;
520 
521 	ret = -EINVAL;
522 	if (optlen < 0)
523 		goto out;
524 
525 	/* Bind this socket to a particular device like "eth0",
526 	 * as specified in the passed interface name. If the
527 	 * name is "" or the option length is zero the socket
528 	 * is not bound.
529 	 */
530 	if (optlen > IFNAMSIZ - 1)
531 		optlen = IFNAMSIZ - 1;
532 	memset(devname, 0, sizeof(devname));
533 
534 	ret = -EFAULT;
535 	if (copy_from_user(devname, optval, optlen))
536 		goto out;
537 
538 	index = 0;
539 	if (devname[0] != '\0') {
540 		struct net_device *dev;
541 
542 		rcu_read_lock();
543 		dev = dev_get_by_name_rcu(net, devname);
544 		if (dev)
545 			index = dev->ifindex;
546 		rcu_read_unlock();
547 		ret = -ENODEV;
548 		if (!dev)
549 			goto out;
550 	}
551 
552 	lock_sock(sk);
553 	sk->sk_bound_dev_if = index;
554 	sk_dst_reset(sk);
555 	release_sock(sk);
556 
557 	ret = 0;
558 
559 out:
560 #endif
561 
562 	return ret;
563 }
564 
565 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
566 {
567 	if (valbool)
568 		sock_set_flag(sk, bit);
569 	else
570 		sock_reset_flag(sk, bit);
571 }
572 
573 /*
574  *	This is meant for all protocols to use and covers goings on
575  *	at the socket level. Everything here is generic.
576  */
577 
578 int sock_setsockopt(struct socket *sock, int level, int optname,
579 		    char __user *optval, unsigned int optlen)
580 {
581 	struct sock *sk = sock->sk;
582 	int val;
583 	int valbool;
584 	struct linger ling;
585 	int ret = 0;
586 
587 	/*
588 	 *	Options without arguments
589 	 */
590 
591 	if (optname == SO_BINDTODEVICE)
592 		return sock_bindtodevice(sk, optval, optlen);
593 
594 	if (optlen < sizeof(int))
595 		return -EINVAL;
596 
597 	if (get_user(val, (int __user *)optval))
598 		return -EFAULT;
599 
600 	valbool = val ? 1 : 0;
601 
602 	lock_sock(sk);
603 
604 	switch (optname) {
605 	case SO_DEBUG:
606 		if (val && !capable(CAP_NET_ADMIN))
607 			ret = -EACCES;
608 		else
609 			sock_valbool_flag(sk, SOCK_DBG, valbool);
610 		break;
611 	case SO_REUSEADDR:
612 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
613 		break;
614 	case SO_TYPE:
615 	case SO_PROTOCOL:
616 	case SO_DOMAIN:
617 	case SO_ERROR:
618 		ret = -ENOPROTOOPT;
619 		break;
620 	case SO_DONTROUTE:
621 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
622 		break;
623 	case SO_BROADCAST:
624 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
625 		break;
626 	case SO_SNDBUF:
627 		/* Don't error on this; BSD doesn't, and if you think
628 		 * about it this is right. Otherwise apps have to
629 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
630 		 * are treated in BSD as hints.
631 		 */
632 		val = min_t(u32, val, sysctl_wmem_max);
633 set_sndbuf:
634 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
635 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
636 		/* Wake up sending tasks if we upped the value. */
637 		sk->sk_write_space(sk);
638 		break;
639 
640 	case SO_SNDBUFFORCE:
641 		if (!capable(CAP_NET_ADMIN)) {
642 			ret = -EPERM;
643 			break;
644 		}
645 		goto set_sndbuf;
646 
647 	case SO_RCVBUF:
648 		/* Don't error on this; BSD doesn't, and if you think
649 		 * about it this is right. Otherwise apps have to
650 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
651 		 * are treated in BSD as hints.
652 		 */
653 		val = min_t(u32, val, sysctl_rmem_max);
654 set_rcvbuf:
655 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
656 		/*
657 		 * We double it on the way in to account for
658 		 * "struct sk_buff" etc. overhead.   Applications
659 		 * assume that the SO_RCVBUF setting they make will
660 		 * allow that much actual data to be received on that
661 		 * socket.
662 		 *
663 		 * Applications are unaware that "struct sk_buff" and
664 		 * other overheads allocate from the receive buffer
665 		 * during socket buffer allocation.
666 		 *
667 		 * And after considering the possible alternatives,
668 		 * returning the value we actually used in getsockopt
669 		 * is the most desirable behavior.
670 		 */
671 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
672 		break;
673 
674 	case SO_RCVBUFFORCE:
675 		if (!capable(CAP_NET_ADMIN)) {
676 			ret = -EPERM;
677 			break;
678 		}
679 		goto set_rcvbuf;
680 
681 	case SO_KEEPALIVE:
682 #ifdef CONFIG_INET
683 		if (sk->sk_protocol == IPPROTO_TCP &&
684 		    sk->sk_type == SOCK_STREAM)
685 			tcp_set_keepalive(sk, valbool);
686 #endif
687 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
688 		break;
689 
690 	case SO_OOBINLINE:
691 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
692 		break;
693 
694 	case SO_NO_CHECK:
695 		sk->sk_no_check = valbool;
696 		break;
697 
698 	case SO_PRIORITY:
699 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
700 			sk->sk_priority = val;
701 		else
702 			ret = -EPERM;
703 		break;
704 
705 	case SO_LINGER:
706 		if (optlen < sizeof(ling)) {
707 			ret = -EINVAL;	/* 1003.1g */
708 			break;
709 		}
710 		if (copy_from_user(&ling, optval, sizeof(ling))) {
711 			ret = -EFAULT;
712 			break;
713 		}
714 		if (!ling.l_onoff)
715 			sock_reset_flag(sk, SOCK_LINGER);
716 		else {
717 #if (BITS_PER_LONG == 32)
718 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
719 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
720 			else
721 #endif
722 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
723 			sock_set_flag(sk, SOCK_LINGER);
724 		}
725 		break;
726 
727 	case SO_BSDCOMPAT:
728 		sock_warn_obsolete_bsdism("setsockopt");
729 		break;
730 
731 	case SO_PASSCRED:
732 		if (valbool)
733 			set_bit(SOCK_PASSCRED, &sock->flags);
734 		else
735 			clear_bit(SOCK_PASSCRED, &sock->flags);
736 		break;
737 
738 	case SO_TIMESTAMP:
739 	case SO_TIMESTAMPNS:
740 		if (valbool)  {
741 			if (optname == SO_TIMESTAMP)
742 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
743 			else
744 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
745 			sock_set_flag(sk, SOCK_RCVTSTAMP);
746 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
747 		} else {
748 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
749 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
750 		}
751 		break;
752 
753 	case SO_TIMESTAMPING:
754 		if (val & ~SOF_TIMESTAMPING_MASK) {
755 			ret = -EINVAL;
756 			break;
757 		}
758 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
759 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
760 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
761 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
762 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
763 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
764 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
765 			sock_enable_timestamp(sk,
766 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
767 		else
768 			sock_disable_timestamp(sk,
769 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
770 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
771 				  val & SOF_TIMESTAMPING_SOFTWARE);
772 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
773 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
774 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
775 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
776 		break;
777 
778 	case SO_RCVLOWAT:
779 		if (val < 0)
780 			val = INT_MAX;
781 		sk->sk_rcvlowat = val ? : 1;
782 		break;
783 
784 	case SO_RCVTIMEO:
785 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
786 		break;
787 
788 	case SO_SNDTIMEO:
789 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
790 		break;
791 
792 	case SO_ATTACH_FILTER:
793 		ret = -EINVAL;
794 		if (optlen == sizeof(struct sock_fprog)) {
795 			struct sock_fprog fprog;
796 
797 			ret = -EFAULT;
798 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
799 				break;
800 
801 			ret = sk_attach_filter(&fprog, sk);
802 		}
803 		break;
804 
805 	case SO_DETACH_FILTER:
806 		ret = sk_detach_filter(sk);
807 		break;
808 
809 	case SO_PASSSEC:
810 		if (valbool)
811 			set_bit(SOCK_PASSSEC, &sock->flags);
812 		else
813 			clear_bit(SOCK_PASSSEC, &sock->flags);
814 		break;
815 	case SO_MARK:
816 		if (!capable(CAP_NET_ADMIN))
817 			ret = -EPERM;
818 		else
819 			sk->sk_mark = val;
820 		break;
821 
822 		/* We implement the SO_SNDLOWAT etc to
823 		   not be settable (1003.1g 5.3) */
824 	case SO_RXQ_OVFL:
825 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
826 		break;
827 
828 	case SO_WIFI_STATUS:
829 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
830 		break;
831 
832 	case SO_PEEK_OFF:
833 		if (sock->ops->set_peek_off)
834 			sock->ops->set_peek_off(sk, val);
835 		else
836 			ret = -EOPNOTSUPP;
837 		break;
838 
839 	case SO_NOFCS:
840 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
841 		break;
842 
843 	default:
844 		ret = -ENOPROTOOPT;
845 		break;
846 	}
847 	release_sock(sk);
848 	return ret;
849 }
850 EXPORT_SYMBOL(sock_setsockopt);
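
/*
 * Illustrative sketch (not part of the original file): in-kernel users do not
 * normally call sock_setsockopt() directly; they go through
 * kernel_setsockopt(), which lands here for SOL_SOCKET options, e.g.:
 *
 *	int one = 1;
 *
 *	kernel_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 *			  (char *)&one, sizeof(one));
 */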
851 
852 
853 void cred_to_ucred(struct pid *pid, const struct cred *cred,
854 		   struct ucred *ucred)
855 {
856 	ucred->pid = pid_vnr(pid);
857 	ucred->uid = ucred->gid = -1;
858 	if (cred) {
859 		struct user_namespace *current_ns = current_user_ns();
860 
861 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
862 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
863 	}
864 }
865 EXPORT_SYMBOL_GPL(cred_to_ucred);
866 
867 int sock_getsockopt(struct socket *sock, int level, int optname,
868 		    char __user *optval, int __user *optlen)
869 {
870 	struct sock *sk = sock->sk;
871 
872 	union {
873 		int val;
874 		struct linger ling;
875 		struct timeval tm;
876 	} v;
877 
878 	int lv = sizeof(int);
879 	int len;
880 
881 	if (get_user(len, optlen))
882 		return -EFAULT;
883 	if (len < 0)
884 		return -EINVAL;
885 
886 	memset(&v, 0, sizeof(v));
887 
888 	switch (optname) {
889 	case SO_DEBUG:
890 		v.val = sock_flag(sk, SOCK_DBG);
891 		break;
892 
893 	case SO_DONTROUTE:
894 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
895 		break;
896 
897 	case SO_BROADCAST:
898 		v.val = sock_flag(sk, SOCK_BROADCAST);
899 		break;
900 
901 	case SO_SNDBUF:
902 		v.val = sk->sk_sndbuf;
903 		break;
904 
905 	case SO_RCVBUF:
906 		v.val = sk->sk_rcvbuf;
907 		break;
908 
909 	case SO_REUSEADDR:
910 		v.val = sk->sk_reuse;
911 		break;
912 
913 	case SO_KEEPALIVE:
914 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
915 		break;
916 
917 	case SO_TYPE:
918 		v.val = sk->sk_type;
919 		break;
920 
921 	case SO_PROTOCOL:
922 		v.val = sk->sk_protocol;
923 		break;
924 
925 	case SO_DOMAIN:
926 		v.val = sk->sk_family;
927 		break;
928 
929 	case SO_ERROR:
930 		v.val = -sock_error(sk);
931 		if (v.val == 0)
932 			v.val = xchg(&sk->sk_err_soft, 0);
933 		break;
934 
935 	case SO_OOBINLINE:
936 		v.val = sock_flag(sk, SOCK_URGINLINE);
937 		break;
938 
939 	case SO_NO_CHECK:
940 		v.val = sk->sk_no_check;
941 		break;
942 
943 	case SO_PRIORITY:
944 		v.val = sk->sk_priority;
945 		break;
946 
947 	case SO_LINGER:
948 		lv		= sizeof(v.ling);
949 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
950 		v.ling.l_linger	= sk->sk_lingertime / HZ;
951 		break;
952 
953 	case SO_BSDCOMPAT:
954 		sock_warn_obsolete_bsdism("getsockopt");
955 		break;
956 
957 	case SO_TIMESTAMP:
958 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
959 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
960 		break;
961 
962 	case SO_TIMESTAMPNS:
963 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
964 		break;
965 
966 	case SO_TIMESTAMPING:
967 		v.val = 0;
968 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
969 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
970 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
971 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
972 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
973 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
974 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
975 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
976 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
977 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
978 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
979 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
980 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
981 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
982 		break;
983 
984 	case SO_RCVTIMEO:
985 		lv = sizeof(struct timeval);
986 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
987 			v.tm.tv_sec = 0;
988 			v.tm.tv_usec = 0;
989 		} else {
990 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
991 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
992 		}
993 		break;
994 
995 	case SO_SNDTIMEO:
996 		lv = sizeof(struct timeval);
997 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
998 			v.tm.tv_sec = 0;
999 			v.tm.tv_usec = 0;
1000 		} else {
1001 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1002 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1003 		}
1004 		break;
1005 
1006 	case SO_RCVLOWAT:
1007 		v.val = sk->sk_rcvlowat;
1008 		break;
1009 
1010 	case SO_SNDLOWAT:
1011 		v.val = 1;
1012 		break;
1013 
1014 	case SO_PASSCRED:
1015 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1016 		break;
1017 
1018 	case SO_PEERCRED:
1019 	{
1020 		struct ucred peercred;
1021 		if (len > sizeof(peercred))
1022 			len = sizeof(peercred);
1023 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1024 		if (copy_to_user(optval, &peercred, len))
1025 			return -EFAULT;
1026 		goto lenout;
1027 	}
1028 
1029 	case SO_PEERNAME:
1030 	{
1031 		char address[128];
1032 
1033 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1034 			return -ENOTCONN;
1035 		if (lv < len)
1036 			return -EINVAL;
1037 		if (copy_to_user(optval, address, len))
1038 			return -EFAULT;
1039 		goto lenout;
1040 	}
1041 
1042 	/* Dubious BSD thing... Probably nobody even uses it, but
1043 	 * the UNIX standard wants it for whatever reason... -DaveM
1044 	 */
1045 	case SO_ACCEPTCONN:
1046 		v.val = sk->sk_state == TCP_LISTEN;
1047 		break;
1048 
1049 	case SO_PASSSEC:
1050 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1051 		break;
1052 
1053 	case SO_PEERSEC:
1054 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1055 
1056 	case SO_MARK:
1057 		v.val = sk->sk_mark;
1058 		break;
1059 
1060 	case SO_RXQ_OVFL:
1061 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1062 		break;
1063 
1064 	case SO_WIFI_STATUS:
1065 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1066 		break;
1067 
1068 	case SO_PEEK_OFF:
1069 		if (!sock->ops->set_peek_off)
1070 			return -EOPNOTSUPP;
1071 
1072 		v.val = sk->sk_peek_off;
1073 		break;
1074 	case SO_NOFCS:
1075 		v.val = sock_flag(sk, SOCK_NOFCS);
1076 		break;
1077 	default:
1078 		return -ENOPROTOOPT;
1079 	}
1080 
1081 	if (len > lv)
1082 		len = lv;
1083 	if (copy_to_user(optval, &v, len))
1084 		return -EFAULT;
1085 lenout:
1086 	if (put_user(len, optlen))
1087 		return -EFAULT;
1088 	return 0;
1089 }
1090 
1091 /*
1092  * Initialize an sk_lock.
1093  *
1094  * (We also register the sk_lock with the lock validator.)
1095  */
1096 static inline void sock_lock_init(struct sock *sk)
1097 {
1098 	sock_lock_init_class_and_name(sk,
1099 			af_family_slock_key_strings[sk->sk_family],
1100 			af_family_slock_keys + sk->sk_family,
1101 			af_family_key_strings[sk->sk_family],
1102 			af_family_keys + sk->sk_family);
1103 }
1104 
1105 /*
1106  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1107  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1108  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1109  */
1110 static void sock_copy(struct sock *nsk, const struct sock *osk)
1111 {
1112 #ifdef CONFIG_SECURITY_NETWORK
1113 	void *sptr = nsk->sk_security;
1114 #endif
1115 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1116 
1117 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1118 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1119 
1120 #ifdef CONFIG_SECURITY_NETWORK
1121 	nsk->sk_security = sptr;
1122 	security_sk_clone(osk, nsk);
1123 #endif
1124 }
1125 
1126 /*
1127  * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1128  * nodes unmodified. Special care is taken when initializing the object to zero.
1129  */
1130 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1131 {
1132 	if (offsetof(struct sock, sk_node.next) != 0)
1133 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1134 	memset(&sk->sk_node.pprev, 0,
1135 	       size - offsetof(struct sock, sk_node.pprev));
1136 }
1137 
1138 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1139 {
1140 	unsigned long nulls1, nulls2;
1141 
1142 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1143 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1144 	if (nulls1 > nulls2)
1145 		swap(nulls1, nulls2);
1146 
1147 	if (nulls1 != 0)
1148 		memset((char *)sk, 0, nulls1);
1149 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1150 	       nulls2 - nulls1 - sizeof(void *));
1151 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1152 	       size - nulls2 - sizeof(void *));
1153 }
1154 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1155 
1156 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1157 		int family)
1158 {
1159 	struct sock *sk;
1160 	struct kmem_cache *slab;
1161 
1162 	slab = prot->slab;
1163 	if (slab != NULL) {
1164 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1165 		if (!sk)
1166 			return sk;
1167 		if (priority & __GFP_ZERO) {
1168 			if (prot->clear_sk)
1169 				prot->clear_sk(sk, prot->obj_size);
1170 			else
1171 				sk_prot_clear_nulls(sk, prot->obj_size);
1172 		}
1173 	} else
1174 		sk = kmalloc(prot->obj_size, priority);
1175 
1176 	if (sk != NULL) {
1177 		kmemcheck_annotate_bitfield(sk, flags);
1178 
1179 		if (security_sk_alloc(sk, family, priority))
1180 			goto out_free;
1181 
1182 		if (!try_module_get(prot->owner))
1183 			goto out_free_sec;
1184 		sk_tx_queue_clear(sk);
1185 	}
1186 
1187 	return sk;
1188 
1189 out_free_sec:
1190 	security_sk_free(sk);
1191 out_free:
1192 	if (slab != NULL)
1193 		kmem_cache_free(slab, sk);
1194 	else
1195 		kfree(sk);
1196 	return NULL;
1197 }
1198 
1199 static void sk_prot_free(struct proto *prot, struct sock *sk)
1200 {
1201 	struct kmem_cache *slab;
1202 	struct module *owner;
1203 
1204 	owner = prot->owner;
1205 	slab = prot->slab;
1206 
1207 	security_sk_free(sk);
1208 	if (slab != NULL)
1209 		kmem_cache_free(slab, sk);
1210 	else
1211 		kfree(sk);
1212 	module_put(owner);
1213 }
1214 
1215 #ifdef CONFIG_CGROUPS
1216 #if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
1217 void sock_update_classid(struct sock *sk)
1218 {
1219 	u32 classid;
1220 
1221 	rcu_read_lock();  /* doing current task, which cannot vanish. */
1222 	classid = task_cls_classid(current);
1223 	rcu_read_unlock();
1224 	if (classid != sk->sk_classid)
1225 		sk->sk_classid = classid;
1226 }
1227 EXPORT_SYMBOL(sock_update_classid);
1228 #endif
1229 
1230 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
1231 void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
1232 {
1233 	if (in_interrupt())
1234 		return;
1235 
1236 	sk->sk_cgrp_prioidx = task_netprioidx(task);
1237 }
1238 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1239 #endif
1240 #endif
1241 
1242 /**
1243  *	sk_alloc - All socket objects are allocated here
1244  *	@net: the applicable net namespace
1245  *	@family: protocol family
1246  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1247  *	@prot: struct proto associated with this new sock instance
1248  */
1249 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1250 		      struct proto *prot)
1251 {
1252 	struct sock *sk;
1253 
1254 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1255 	if (sk) {
1256 		sk->sk_family = family;
1257 		/*
1258 		 * See comment in struct sock definition to understand
1259 		 * why we need sk_prot_creator -acme
1260 		 */
1261 		sk->sk_prot = sk->sk_prot_creator = prot;
1262 		sock_lock_init(sk);
1263 		sock_net_set(sk, get_net(net));
1264 		atomic_set(&sk->sk_wmem_alloc, 1);
1265 
1266 		sock_update_classid(sk);
1267 		sock_update_netprioidx(sk, current);
1268 	}
1269 
1270 	return sk;
1271 }
1272 EXPORT_SYMBOL(sk_alloc);
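
/*
 * Illustrative sketch (not part of the original file): a protocol family's
 * create() handler typically allocates its sock with sk_alloc() and then
 * finishes generic setup with sock_init_data(); PF_EXAMPLE and example_proto
 * below are hypothetical names:
 *
 *	sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 */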
1273 
1274 static void __sk_free(struct sock *sk)
1275 {
1276 	struct sk_filter *filter;
1277 
1278 	if (sk->sk_destruct)
1279 		sk->sk_destruct(sk);
1280 
1281 	filter = rcu_dereference_check(sk->sk_filter,
1282 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1283 	if (filter) {
1284 		sk_filter_uncharge(sk, filter);
1285 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1286 	}
1287 
1288 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1289 
1290 	if (atomic_read(&sk->sk_omem_alloc))
1291 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1292 			 __func__, atomic_read(&sk->sk_omem_alloc));
1293 
1294 	if (sk->sk_peer_cred)
1295 		put_cred(sk->sk_peer_cred);
1296 	put_pid(sk->sk_peer_pid);
1297 	put_net(sock_net(sk));
1298 	sk_prot_free(sk->sk_prot_creator, sk);
1299 }
1300 
1301 void sk_free(struct sock *sk)
1302 {
1303 	/*
1304 	 * We subtract one from sk_wmem_alloc so we can tell whether
1305 	 * some packets are still in some tx queue.
1306 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1307 	 */
1308 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1309 		__sk_free(sk);
1310 }
1311 EXPORT_SYMBOL(sk_free);
1312 
1313 /*
1314  * The last sock_put should drop the reference to sk->sk_net. It has already
1315  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1316  * is not an option.
1317  * Take a reference to the socket to remove it from the hash while it is still
1318  * _alive_, and after that destroy it in the context of init_net.
1319  */
1320 void sk_release_kernel(struct sock *sk)
1321 {
1322 	if (sk == NULL || sk->sk_socket == NULL)
1323 		return;
1324 
1325 	sock_hold(sk);
1326 	sock_release(sk->sk_socket);
1327 	release_net(sock_net(sk));
1328 	sock_net_set(sk, get_net(&init_net));
1329 	sock_put(sk);
1330 }
1331 EXPORT_SYMBOL(sk_release_kernel);
1332 
1333 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1334 {
1335 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1336 		sock_update_memcg(newsk);
1337 }
1338 
1339 /**
1340  *	sk_clone_lock - clone a socket, and lock its clone
1341  *	@sk: the socket to clone
1342  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1343  *
1344  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1345  */
1346 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1347 {
1348 	struct sock *newsk;
1349 
1350 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1351 	if (newsk != NULL) {
1352 		struct sk_filter *filter;
1353 
1354 		sock_copy(newsk, sk);
1355 
1356 		/* SANITY */
1357 		get_net(sock_net(newsk));
1358 		sk_node_init(&newsk->sk_node);
1359 		sock_lock_init(newsk);
1360 		bh_lock_sock(newsk);
1361 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1362 		newsk->sk_backlog.len = 0;
1363 
1364 		atomic_set(&newsk->sk_rmem_alloc, 0);
1365 		/*
1366 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1367 		 */
1368 		atomic_set(&newsk->sk_wmem_alloc, 1);
1369 		atomic_set(&newsk->sk_omem_alloc, 0);
1370 		skb_queue_head_init(&newsk->sk_receive_queue);
1371 		skb_queue_head_init(&newsk->sk_write_queue);
1372 #ifdef CONFIG_NET_DMA
1373 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1374 #endif
1375 
1376 		spin_lock_init(&newsk->sk_dst_lock);
1377 		rwlock_init(&newsk->sk_callback_lock);
1378 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1379 				af_callback_keys + newsk->sk_family,
1380 				af_family_clock_key_strings[newsk->sk_family]);
1381 
1382 		newsk->sk_dst_cache	= NULL;
1383 		newsk->sk_wmem_queued	= 0;
1384 		newsk->sk_forward_alloc = 0;
1385 		newsk->sk_send_head	= NULL;
1386 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1387 
1388 		sock_reset_flag(newsk, SOCK_DONE);
1389 		skb_queue_head_init(&newsk->sk_error_queue);
1390 
1391 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1392 		if (filter != NULL)
1393 			sk_filter_charge(newsk, filter);
1394 
1395 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1396 			/* It is still a raw copy of the parent, so invalidate
1397 			 * the destructor and do a plain sk_free() */
1398 			newsk->sk_destruct = NULL;
1399 			bh_unlock_sock(newsk);
1400 			sk_free(newsk);
1401 			newsk = NULL;
1402 			goto out;
1403 		}
1404 
1405 		newsk->sk_err	   = 0;
1406 		newsk->sk_priority = 0;
1407 		/*
1408 		 * Before updating sk_refcnt, we must commit prior changes to memory
1409 		 * (Documentation/RCU/rculist_nulls.txt for details)
1410 		 */
1411 		smp_wmb();
1412 		atomic_set(&newsk->sk_refcnt, 2);
1413 
1414 		/*
1415 		 * Increment the counter in the same struct proto as the master
1416 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1417 		 * is the same as sk->sk_prot->socks, as this field was copied
1418 		 * with memcpy).
1419 		 *
1420 		 * This _changes_ the previous behaviour, where
1421 		 * tcp_create_openreq_child always incremented the
1422 		 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
1423 		 * to be taken into account in all callers. -acme
1424 		 */
1425 		sk_refcnt_debug_inc(newsk);
1426 		sk_set_socket(newsk, NULL);
1427 		newsk->sk_wq = NULL;
1428 
1429 		sk_update_clone(sk, newsk);
1430 
1431 		if (newsk->sk_prot->sockets_allocated)
1432 			sk_sockets_allocated_inc(newsk);
1433 
1434 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1435 			net_enable_timestamp();
1436 	}
1437 out:
1438 	return newsk;
1439 }
1440 EXPORT_SYMBOL_GPL(sk_clone_lock);
1441 
1442 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1443 {
1444 	__sk_dst_set(sk, dst);
1445 	sk->sk_route_caps = dst->dev->features;
1446 	if (sk->sk_route_caps & NETIF_F_GSO)
1447 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1448 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1449 	if (sk_can_gso(sk)) {
1450 		if (dst->header_len) {
1451 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1452 		} else {
1453 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1454 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1455 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1456 		}
1457 	}
1458 }
1459 EXPORT_SYMBOL_GPL(sk_setup_caps);
1460 
1461 /*
1462  *	Simple resource managers for sockets.
1463  */
1464 
1465 
1466 /*
1467  * Write buffer destructor automatically called from kfree_skb.
1468  */
1469 void sock_wfree(struct sk_buff *skb)
1470 {
1471 	struct sock *sk = skb->sk;
1472 	unsigned int len = skb->truesize;
1473 
1474 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1475 		/*
1476 		 * Keep a reference on sk_wmem_alloc; it will be released
1477 		 * after the sk_write_space() call
1478 		 */
1479 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1480 		sk->sk_write_space(sk);
1481 		len = 1;
1482 	}
1483 	/*
1484 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1485 	 * could not do because of in-flight packets
1486 	 */
1487 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1488 		__sk_free(sk);
1489 }
1490 EXPORT_SYMBOL(sock_wfree);
1491 
1492 /*
1493  * Read buffer destructor automatically called from kfree_skb.
1494  */
1495 void sock_rfree(struct sk_buff *skb)
1496 {
1497 	struct sock *sk = skb->sk;
1498 	unsigned int len = skb->truesize;
1499 
1500 	atomic_sub(len, &sk->sk_rmem_alloc);
1501 	sk_mem_uncharge(sk, len);
1502 }
1503 EXPORT_SYMBOL(sock_rfree);
1504 
1505 void sock_edemux(struct sk_buff *skb)
1506 {
1507 	struct sock *sk = skb->sk;
1508 
1509 #ifdef CONFIG_INET
1510 	if (sk->sk_state == TCP_TIME_WAIT)
1511 		inet_twsk_put(inet_twsk(sk));
1512 	else
1513 #endif
1514 		sock_put(sk);
1515 }
1516 EXPORT_SYMBOL(sock_edemux);
1517 
1518 kuid_t sock_i_uid(struct sock *sk)
1519 {
1520 	kuid_t uid;
1521 
1522 	read_lock_bh(&sk->sk_callback_lock);
1523 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1524 	read_unlock_bh(&sk->sk_callback_lock);
1525 	return uid;
1526 }
1527 EXPORT_SYMBOL(sock_i_uid);
1528 
1529 unsigned long sock_i_ino(struct sock *sk)
1530 {
1531 	unsigned long ino;
1532 
1533 	read_lock_bh(&sk->sk_callback_lock);
1534 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1535 	read_unlock_bh(&sk->sk_callback_lock);
1536 	return ino;
1537 }
1538 EXPORT_SYMBOL(sock_i_ino);
1539 
1540 /*
1541  * Allocate a skb from the socket's send buffer.
1542  */
1543 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1544 			     gfp_t priority)
1545 {
1546 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1547 		struct sk_buff *skb = alloc_skb(size, priority);
1548 		if (skb) {
1549 			skb_set_owner_w(skb, sk);
1550 			return skb;
1551 		}
1552 	}
1553 	return NULL;
1554 }
1555 EXPORT_SYMBOL(sock_wmalloc);
1556 
1557 /*
1558  * Allocate a skb from the socket's receive buffer.
1559  */
1560 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1561 			     gfp_t priority)
1562 {
1563 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1564 		struct sk_buff *skb = alloc_skb(size, priority);
1565 		if (skb) {
1566 			skb_set_owner_r(skb, sk);
1567 			return skb;
1568 		}
1569 	}
1570 	return NULL;
1571 }
1572 
1573 /*
1574  * Allocate a memory block from the socket's option memory buffer.
1575  */
1576 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1577 {
1578 	if ((unsigned int)size <= sysctl_optmem_max &&
1579 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1580 		void *mem;
1581 		/* First do the add, to avoid the race if kmalloc
1582 		 * might sleep.
1583 		 */
1584 		atomic_add(size, &sk->sk_omem_alloc);
1585 		mem = kmalloc(size, priority);
1586 		if (mem)
1587 			return mem;
1588 		atomic_sub(size, &sk->sk_omem_alloc);
1589 	}
1590 	return NULL;
1591 }
1592 EXPORT_SYMBOL(sock_kmalloc);
1593 
1594 /*
1595  * Free an option memory block.
1596  */
1597 void sock_kfree_s(struct sock *sk, void *mem, int size)
1598 {
1599 	kfree(mem);
1600 	atomic_sub(size, &sk->sk_omem_alloc);
1601 }
1602 EXPORT_SYMBOL(sock_kfree_s);
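
/*
 * Illustrative sketch (not part of the original file): sock_kmalloc() and
 * sock_kfree_s() must be used as a pair so the per-socket option memory
 * accounting (sk_omem_alloc) stays balanced:
 *
 *	opt = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, len);
 */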
1603 
1604 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1605    I think these locks should be removed for datagram sockets.
1606  */
1607 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1608 {
1609 	DEFINE_WAIT(wait);
1610 
1611 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1612 	for (;;) {
1613 		if (!timeo)
1614 			break;
1615 		if (signal_pending(current))
1616 			break;
1617 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1618 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1619 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1620 			break;
1621 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1622 			break;
1623 		if (sk->sk_err)
1624 			break;
1625 		timeo = schedule_timeout(timeo);
1626 	}
1627 	finish_wait(sk_sleep(sk), &wait);
1628 	return timeo;
1629 }
1630 
1631 
1632 /*
1633  *	Generic send/receive buffer handlers
1634  */
1635 
1636 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1637 				     unsigned long data_len, int noblock,
1638 				     int *errcode)
1639 {
1640 	struct sk_buff *skb;
1641 	gfp_t gfp_mask;
1642 	long timeo;
1643 	int err;
1644 	int npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1645 
1646 	err = -EMSGSIZE;
1647 	if (npages > MAX_SKB_FRAGS)
1648 		goto failure;
1649 
1650 	gfp_mask = sk->sk_allocation;
1651 	if (gfp_mask & __GFP_WAIT)
1652 		gfp_mask |= __GFP_REPEAT;
1653 
1654 	timeo = sock_sndtimeo(sk, noblock);
1655 	while (1) {
1656 		err = sock_error(sk);
1657 		if (err != 0)
1658 			goto failure;
1659 
1660 		err = -EPIPE;
1661 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1662 			goto failure;
1663 
1664 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1665 			skb = alloc_skb(header_len, gfp_mask);
1666 			if (skb) {
1667 				int i;
1668 
1669 				/* No pages, we're done... */
1670 				if (!data_len)
1671 					break;
1672 
1673 				skb->truesize += data_len;
1674 				skb_shinfo(skb)->nr_frags = npages;
1675 				for (i = 0; i < npages; i++) {
1676 					struct page *page;
1677 
1678 					page = alloc_pages(sk->sk_allocation, 0);
1679 					if (!page) {
1680 						err = -ENOBUFS;
1681 						skb_shinfo(skb)->nr_frags = i;
1682 						kfree_skb(skb);
1683 						goto failure;
1684 					}
1685 
1686 					__skb_fill_page_desc(skb, i,
1687 							page, 0,
1688 							(data_len >= PAGE_SIZE ?
1689 							 PAGE_SIZE :
1690 							 data_len));
1691 					data_len -= PAGE_SIZE;
1692 				}
1693 
1694 				/* Full success... */
1695 				break;
1696 			}
1697 			err = -ENOBUFS;
1698 			goto failure;
1699 		}
1700 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1701 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1702 		err = -EAGAIN;
1703 		if (!timeo)
1704 			goto failure;
1705 		if (signal_pending(current))
1706 			goto interrupted;
1707 		timeo = sock_wait_for_wmem(sk, timeo);
1708 	}
1709 
1710 	skb_set_owner_w(skb, sk);
1711 	return skb;
1712 
1713 interrupted:
1714 	err = sock_intr_errno(timeo);
1715 failure:
1716 	*errcode = err;
1717 	return NULL;
1718 }
1719 EXPORT_SYMBOL(sock_alloc_send_pskb);
1720 
1721 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1722 				    int noblock, int *errcode)
1723 {
1724 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1725 }
1726 EXPORT_SYMBOL(sock_alloc_send_skb);
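
/*
 * Illustrative sketch (not part of the original file): a datagram sendmsg()
 * implementation typically allocates its skb under the socket's send buffer
 * limit like this, where "reserve" stands for the needed link-layer headroom:
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out_err;
 *	skb_reserve(skb, reserve);
 */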
1727 
1728 /* On 32-bit arches, an skb frag is limited to 2^15 bytes */
1729 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1730 
1731 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1732 {
1733 	int order;
1734 
1735 	if (pfrag->page) {
1736 		if (atomic_read(&pfrag->page->_count) == 1) {
1737 			pfrag->offset = 0;
1738 			return true;
1739 		}
1740 		if (pfrag->offset < pfrag->size)
1741 			return true;
1742 		put_page(pfrag->page);
1743 	}
1744 
1745 	/* We restrict high order allocations to users that can afford to wait */
1746 	order = (sk->sk_allocation & __GFP_WAIT) ? SKB_FRAG_PAGE_ORDER : 0;
1747 
1748 	do {
1749 		gfp_t gfp = sk->sk_allocation;
1750 
1751 		if (order)
1752 			gfp |= __GFP_COMP | __GFP_NOWARN;
1753 		pfrag->page = alloc_pages(gfp, order);
1754 		if (likely(pfrag->page)) {
1755 			pfrag->offset = 0;
1756 			pfrag->size = PAGE_SIZE << order;
1757 			return true;
1758 		}
1759 	} while (--order >= 0);
1760 
1761 	sk_enter_memory_pressure(sk);
1762 	sk_stream_moderate_sndbuf(sk);
1763 	return false;
1764 }
1765 EXPORT_SYMBOL(sk_page_frag_refill);
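
/*
 * Illustrative sketch (not part of the original file): a sendmsg() fast path
 * that copies user data into the per-socket page fragment would use this
 * roughly as follows (locking and error handling elided):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, copy, pfrag->size - pfrag->offset);
 *	... copy data to page_address(pfrag->page) + pfrag->offset ...
 *	pfrag->offset += copy;
 */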
1766 
1767 static void __lock_sock(struct sock *sk)
1768 	__releases(&sk->sk_lock.slock)
1769 	__acquires(&sk->sk_lock.slock)
1770 {
1771 	DEFINE_WAIT(wait);
1772 
1773 	for (;;) {
1774 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1775 					TASK_UNINTERRUPTIBLE);
1776 		spin_unlock_bh(&sk->sk_lock.slock);
1777 		schedule();
1778 		spin_lock_bh(&sk->sk_lock.slock);
1779 		if (!sock_owned_by_user(sk))
1780 			break;
1781 	}
1782 	finish_wait(&sk->sk_lock.wq, &wait);
1783 }
1784 
1785 static void __release_sock(struct sock *sk)
1786 	__releases(&sk->sk_lock.slock)
1787 	__acquires(&sk->sk_lock.slock)
1788 {
1789 	struct sk_buff *skb = sk->sk_backlog.head;
1790 
1791 	do {
1792 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1793 		bh_unlock_sock(sk);
1794 
1795 		do {
1796 			struct sk_buff *next = skb->next;
1797 
1798 			prefetch(next);
1799 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1800 			skb->next = NULL;
1801 			sk_backlog_rcv(sk, skb);
1802 
1803 			/*
1804 			 * We are in process context here with softirqs
1805 			 * disabled, use cond_resched_softirq() to preempt.
1806 			 * This is safe to do because we've taken the backlog
1807 			 * queue private:
1808 			 */
1809 			cond_resched_softirq();
1810 
1811 			skb = next;
1812 		} while (skb != NULL);
1813 
1814 		bh_lock_sock(sk);
1815 	} while ((skb = sk->sk_backlog.head) != NULL);
1816 
1817 	/*
1818 	 * Doing the zeroing here guarantees we cannot loop forever
1819 	 * while a wild producer attempts to flood us.
1820 	 */
1821 	sk->sk_backlog.len = 0;
1822 }
1823 
1824 /**
1825  * sk_wait_data - wait for data to arrive at sk_receive_queue
1826  * @sk:    sock to wait on
1827  * @timeo: for how long
1828  *
1829  * Now socket state including sk->sk_err is changed only under lock,
1830  * hence we may omit checks after joining the wait queue.
1831  * We check the receive queue before schedule() only as an optimization;
1832  * it is very likely that release_sock() added new data.
1833  */
1834 int sk_wait_data(struct sock *sk, long *timeo)
1835 {
1836 	int rc;
1837 	DEFINE_WAIT(wait);
1838 
1839 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1840 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1841 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1842 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1843 	finish_wait(sk_sleep(sk), &wait);
1844 	return rc;
1845 }
1846 EXPORT_SYMBOL(sk_wait_data);
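
/*
 * Illustrative sketch (not part of the original file): a blocking recvmsg()
 * loop typically combines sk_wait_data() with a timeout obtained from
 * sock_rcvtimeo(), e.g.:
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */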
1847 
1848 /**
1849  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1850  *	@sk: socket
1851  *	@size: memory size to allocate
1852  *	@kind: allocation type
1853  *
1854  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1855  *	rmem allocation. This function assumes that protocols which have
1856  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1857  */
1858 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1859 {
1860 	struct proto *prot = sk->sk_prot;
1861 	int amt = sk_mem_pages(size);
1862 	long allocated;
1863 	int parent_status = UNDER_LIMIT;
1864 
1865 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1866 
1867 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1868 
1869 	/* Under limit. */
1870 	if (parent_status == UNDER_LIMIT &&
1871 			allocated <= sk_prot_mem_limits(sk, 0)) {
1872 		sk_leave_memory_pressure(sk);
1873 		return 1;
1874 	}
1875 
1876 	/* Under pressure. (we or our parents) */
1877 	if ((parent_status > SOFT_LIMIT) ||
1878 			allocated > sk_prot_mem_limits(sk, 1))
1879 		sk_enter_memory_pressure(sk);
1880 
1881 	/* Over hard limit (we or our parents) */
1882 	if ((parent_status == OVER_LIMIT) ||
1883 			(allocated > sk_prot_mem_limits(sk, 2)))
1884 		goto suppress_allocation;
1885 
1886 	/* guarantee minimum buffer size under pressure */
1887 	if (kind == SK_MEM_RECV) {
1888 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1889 			return 1;
1890 
1891 	} else { /* SK_MEM_SEND */
1892 		if (sk->sk_type == SOCK_STREAM) {
1893 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1894 				return 1;
1895 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1896 			   prot->sysctl_wmem[0])
1897 				return 1;
1898 	}
1899 
1900 	if (sk_has_memory_pressure(sk)) {
1901 		int alloc;
1902 
1903 		if (!sk_under_memory_pressure(sk))
1904 			return 1;
1905 		alloc = sk_sockets_allocated_read_positive(sk);
1906 		if (sk_prot_mem_limits(sk, 2) > alloc *
1907 		    sk_mem_pages(sk->sk_wmem_queued +
1908 				 atomic_read(&sk->sk_rmem_alloc) +
1909 				 sk->sk_forward_alloc))
1910 			return 1;
1911 	}
1912 
1913 suppress_allocation:
1914 
1915 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1916 		sk_stream_moderate_sndbuf(sk);
1917 
1918 		/* Fail only if socket is _under_ its sndbuf.
1919 		 * In this case we cannot block, so we have to fail.
1920 		 */
1921 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1922 			return 1;
1923 	}
1924 
1925 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1926 
1927 	/* Alas. Undo changes. */
1928 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1929 
1930 	sk_memory_allocated_sub(sk, amt);
1931 
1932 	return 0;
1933 }
1934 EXPORT_SYMBOL(__sk_mem_schedule);
1935 
1936 /**
1937  *	__sk_mem_reclaim - reclaim memory_allocated
1938  *	@sk: socket
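 *
 *	Returns whole SK_MEM_QUANTUM-sized units of sk_forward_alloc to the
 *	protocol's global memory_allocated counter; only the sub-quantum
 *	remainder stays charged to the socket.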
1939  */
1940 void __sk_mem_reclaim(struct sock *sk)
1941 {
1942 	sk_memory_allocated_sub(sk,
1943 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1944 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1945 
1946 	if (sk_under_memory_pressure(sk) &&
1947 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1948 		sk_leave_memory_pressure(sk);
1949 }
1950 EXPORT_SYMBOL(__sk_mem_reclaim);
1951 
1952 
1953 /*
1954  * Set of default routines for initialising struct proto_ops when
1955  * the protocol does not support a particular function. In certain
1956  * cases where it makes no sense for a protocol to have a "do nothing"
1957  * function, some default processing is provided.
1958  */
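
/*
 * Example (sketch, not an ops table from this file): a protocol that does
 * not implement e.g. socketpair or mmap simply points the corresponding
 * struct proto_ops members at these stubs:
 *
 *	static const struct proto_ops example_ops = {
 *		...
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};
 */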
1959 
1960 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1961 {
1962 	return -EOPNOTSUPP;
1963 }
1964 EXPORT_SYMBOL(sock_no_bind);
1965 
1966 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1967 		    int len, int flags)
1968 {
1969 	return -EOPNOTSUPP;
1970 }
1971 EXPORT_SYMBOL(sock_no_connect);
1972 
1973 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1974 {
1975 	return -EOPNOTSUPP;
1976 }
1977 EXPORT_SYMBOL(sock_no_socketpair);
1978 
1979 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1980 {
1981 	return -EOPNOTSUPP;
1982 }
1983 EXPORT_SYMBOL(sock_no_accept);
1984 
1985 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1986 		    int *len, int peer)
1987 {
1988 	return -EOPNOTSUPP;
1989 }
1990 EXPORT_SYMBOL(sock_no_getname);
1991 
1992 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1993 {
1994 	return 0;
1995 }
1996 EXPORT_SYMBOL(sock_no_poll);
1997 
1998 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1999 {
2000 	return -EOPNOTSUPP;
2001 }
2002 EXPORT_SYMBOL(sock_no_ioctl);
2003 
2004 int sock_no_listen(struct socket *sock, int backlog)
2005 {
2006 	return -EOPNOTSUPP;
2007 }
2008 EXPORT_SYMBOL(sock_no_listen);
2009 
2010 int sock_no_shutdown(struct socket *sock, int how)
2011 {
2012 	return -EOPNOTSUPP;
2013 }
2014 EXPORT_SYMBOL(sock_no_shutdown);
2015 
2016 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2017 		    char __user *optval, unsigned int optlen)
2018 {
2019 	return -EOPNOTSUPP;
2020 }
2021 EXPORT_SYMBOL(sock_no_setsockopt);
2022 
2023 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2024 		    char __user *optval, int __user *optlen)
2025 {
2026 	return -EOPNOTSUPP;
2027 }
2028 EXPORT_SYMBOL(sock_no_getsockopt);
2029 
2030 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2031 		    size_t len)
2032 {
2033 	return -EOPNOTSUPP;
2034 }
2035 EXPORT_SYMBOL(sock_no_sendmsg);
2036 
2037 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
2038 		    size_t len, int flags)
2039 {
2040 	return -EOPNOTSUPP;
2041 }
2042 EXPORT_SYMBOL(sock_no_recvmsg);
2043 
2044 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2045 {
2046 	/* Mirror missing mmap method error code */
2047 	return -ENODEV;
2048 }
2049 EXPORT_SYMBOL(sock_no_mmap);
2050 
2051 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2052 {
2053 	ssize_t res;
2054 	struct msghdr msg = {.msg_flags = flags};
2055 	struct kvec iov;
2056 	char *kaddr = kmap(page);
2057 	iov.iov_base = kaddr + offset;
2058 	iov.iov_len = size;
2059 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2060 	kunmap(page);
2061 	return res;
2062 }
2063 EXPORT_SYMBOL(sock_no_sendpage);
2064 
2065 /*
2066  *	Default Socket Callbacks
2067  */
2068 
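/*
 * These defaults are installed by sock_init_data(). The wakeup callbacks
 * below take rcu_read_lock() to dereference sk->sk_wq and use
 * wq_has_sleeper() so that a wakeup is only issued when somebody is actually
 * waiting; the memory barrier pairing that makes this check safe is
 * described above wq_has_sleeper().
 */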
2069 static void sock_def_wakeup(struct sock *sk)
2070 {
2071 	struct socket_wq *wq;
2072 
2073 	rcu_read_lock();
2074 	wq = rcu_dereference(sk->sk_wq);
2075 	if (wq_has_sleeper(wq))
2076 		wake_up_interruptible_all(&wq->wait);
2077 	rcu_read_unlock();
2078 }
2079 
2080 static void sock_def_error_report(struct sock *sk)
2081 {
2082 	struct socket_wq *wq;
2083 
2084 	rcu_read_lock();
2085 	wq = rcu_dereference(sk->sk_wq);
2086 	if (wq_has_sleeper(wq))
2087 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2088 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2089 	rcu_read_unlock();
2090 }
2091 
2092 static void sock_def_readable(struct sock *sk, int len)
2093 {
2094 	struct socket_wq *wq;
2095 
2096 	rcu_read_lock();
2097 	wq = rcu_dereference(sk->sk_wq);
2098 	if (wq_has_sleeper(wq))
2099 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2100 						POLLRDNORM | POLLRDBAND);
2101 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2102 	rcu_read_unlock();
2103 }
2104 
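/*
 * Default write-space callback: only wake writers once at least half of
 * sk_sndbuf is available again, so a blocked sender is not woken for every
 * single freed skb.
 */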
2105 static void sock_def_write_space(struct sock *sk)
2106 {
2107 	struct socket_wq *wq;
2108 
2109 	rcu_read_lock();
2110 
2111 	/* Do not wake up a writer until he can make "significant"
2112 	 * progress.  --DaveM
2113 	 */
2114 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2115 		wq = rcu_dereference(sk->sk_wq);
2116 		if (wq_has_sleeper(wq))
2117 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2118 						POLLWRNORM | POLLWRBAND);
2119 
2120 		/* Should agree with poll, otherwise some programs break */
2121 		if (sock_writeable(sk))
2122 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2123 	}
2124 
2125 	rcu_read_unlock();
2126 }
2127 
2128 static void sock_def_destruct(struct sock *sk)
2129 {
2130 	kfree(sk->sk_protinfo);
2131 }
2132 
2133 void sk_send_sigurg(struct sock *sk)
2134 {
2135 	if (sk->sk_socket && sk->sk_socket->file)
2136 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2137 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2138 }
2139 EXPORT_SYMBOL(sk_send_sigurg);
2140 
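/*
 * (Re)arm a timer that is associated with this socket. If the timer was not
 * already pending, take a reference on the socket so it cannot be freed
 * while the timer is outstanding; the reference is dropped again by
 * sk_stop_timer() or by the protocol's timer handler.
 */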
2141 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2142 		    unsigned long expires)
2143 {
2144 	if (!mod_timer(timer, expires))
2145 		sock_hold(sk);
2146 }
2147 EXPORT_SYMBOL(sk_reset_timer);
2148 
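/*
 * Deactivate a socket timer and, if it was still pending, drop the
 * reference taken by sk_reset_timer().
 */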
2149 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2150 {
2151 	if (timer_pending(timer) && del_timer(timer))
2152 		__sock_put(sk);
2153 }
2154 EXPORT_SYMBOL(sk_stop_timer);
2155 
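/*
 * Initialise the protocol-independent fields of a freshly allocated
 * struct sock and, when a struct socket is supplied, tie the two together.
 * Protocol families call this from their create/init paths after sk_alloc().
 */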
2156 void sock_init_data(struct socket *sock, struct sock *sk)
2157 {
2158 	skb_queue_head_init(&sk->sk_receive_queue);
2159 	skb_queue_head_init(&sk->sk_write_queue);
2160 	skb_queue_head_init(&sk->sk_error_queue);
2161 #ifdef CONFIG_NET_DMA
2162 	skb_queue_head_init(&sk->sk_async_wait_queue);
2163 #endif
2164 
2165 	sk->sk_send_head	=	NULL;
2166 
2167 	init_timer(&sk->sk_timer);
2168 
2169 	sk->sk_allocation	=	GFP_KERNEL;
2170 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2171 	sk->sk_sndbuf		=	sysctl_wmem_default;
2172 	sk->sk_state		=	TCP_CLOSE;
2173 	sk_set_socket(sk, sock);
2174 
2175 	sock_set_flag(sk, SOCK_ZAPPED);
2176 
2177 	if (sock) {
2178 		sk->sk_type	=	sock->type;
2179 		sk->sk_wq	=	sock->wq;
2180 		sock->sk	=	sk;
2181 	} else
2182 		sk->sk_wq	=	NULL;
2183 
2184 	spin_lock_init(&sk->sk_dst_lock);
2185 	rwlock_init(&sk->sk_callback_lock);
2186 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2187 			af_callback_keys + sk->sk_family,
2188 			af_family_clock_key_strings[sk->sk_family]);
2189 
2190 	sk->sk_state_change	=	sock_def_wakeup;
2191 	sk->sk_data_ready	=	sock_def_readable;
2192 	sk->sk_write_space	=	sock_def_write_space;
2193 	sk->sk_error_report	=	sock_def_error_report;
2194 	sk->sk_destruct		=	sock_def_destruct;
2195 
2196 	sk->sk_frag.page	=	NULL;
2197 	sk->sk_frag.offset	=	0;
2198 	sk->sk_peek_off		=	-1;
2199 
2200 	sk->sk_peer_pid 	=	NULL;
2201 	sk->sk_peer_cred	=	NULL;
2202 	sk->sk_write_pending	=	0;
2203 	sk->sk_rcvlowat		=	1;
2204 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2205 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2206 
2207 	sk->sk_stamp = ktime_set(-1L, 0);
2208 
2209 	/*
2210 	 * Before updating sk_refcnt, we must commit prior changes to memory
2211 	 * (Documentation/RCU/rculist_nulls.txt for details)
2212 	 */
2213 	smp_wmb();
2214 	atomic_set(&sk->sk_refcnt, 1);
2215 	atomic_set(&sk->sk_drops, 0);
2216 }
2217 EXPORT_SYMBOL(sock_init_data);
2218 
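/*
 * Socket lock scheme: sk_lock.slock is a spinlock shared with softirq
 * receive paths, while sk_lock.owned marks process-context ownership.
 * lock_sock_nested() takes slock, waits in __lock_sock() until no other
 * process owns the socket, marks it owned and drops slock again, so the
 * owner may sleep inside its critical section. Softirq code that finds the
 * socket owned queues packets on sk_backlog instead of processing them.
 */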
2219 void lock_sock_nested(struct sock *sk, int subclass)
2220 {
2221 	might_sleep();
2222 	spin_lock_bh(&sk->sk_lock.slock);
2223 	if (sk->sk_lock.owned)
2224 		__lock_sock(sk);
2225 	sk->sk_lock.owned = 1;
2226 	spin_unlock(&sk->sk_lock.slock);
2227 	/*
2228 	 * The sk_lock has mutex_lock() semantics here:
2229 	 */
2230 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2231 	local_bh_enable();
2232 }
2233 EXPORT_SYMBOL(lock_sock_nested);
2234 
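/*
 * Drop process-context ownership of the socket: flush any backlog that
 * accumulated while the lock was owned, let the protocol run deferred work
 * via release_cb, then wake up other lockers.
 */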
2235 void release_sock(struct sock *sk)
2236 {
2237 	/*
2238 	 * The sk_lock has mutex_unlock() semantics:
2239 	 */
2240 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2241 
2242 	spin_lock_bh(&sk->sk_lock.slock);
2243 	if (sk->sk_backlog.tail)
2244 		__release_sock(sk);
2245 
2246 	if (sk->sk_prot->release_cb)
2247 		sk->sk_prot->release_cb(sk);
2248 
2249 	sk->sk_lock.owned = 0;
2250 	if (waitqueue_active(&sk->sk_lock.wq))
2251 		wake_up(&sk->sk_lock.wq);
2252 	spin_unlock_bh(&sk->sk_lock.slock);
2253 }
2254 EXPORT_SYMBOL(release_sock);
2255 
2256 /**
2257  * lock_sock_fast - fast version of lock_sock
2258  * @sk: socket
2259  *
2260  * This version should be used for very small sections where the process
2261  * won't block. Returns false if the fast path is taken:
2262  *   sk_lock.slock locked, owned = 0, BH disabled
2263  * Returns true if the slow path is taken:
2264  *   sk_lock.slock unlocked, owned = 1, BH enabled
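 *
 * The return value must be handed back to unlock_sock_fast(), e.g. (sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking work on the socket ...
 *	unlock_sock_fast(sk, slow);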
2265  */
2266 bool lock_sock_fast(struct sock *sk)
2267 {
2268 	might_sleep();
2269 	spin_lock_bh(&sk->sk_lock.slock);
2270 
2271 	if (!sk->sk_lock.owned)
2272 		/*
2273 		 * Note: we return with BH disabled and slock held
2274 		 */
2275 		return false;
2276 
2277 	__lock_sock(sk);
2278 	sk->sk_lock.owned = 1;
2279 	spin_unlock(&sk->sk_lock.slock);
2280 	/*
2281 	 * The sk_lock has mutex_lock() semantics here:
2282 	 */
2283 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2284 	local_bh_enable();
2285 	return true;
2286 }
2287 EXPORT_SYMBOL(lock_sock_fast);
2288 
2289 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2290 {
2291 	struct timeval tv;
2292 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2293 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2294 	tv = ktime_to_timeval(sk->sk_stamp);
2295 	if (tv.tv_sec == -1)
2296 		return -ENOENT;
2297 	if (tv.tv_sec == 0) {
2298 		sk->sk_stamp = ktime_get_real();
2299 		tv = ktime_to_timeval(sk->sk_stamp);
2300 	}
2301 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2302 }
2303 EXPORT_SYMBOL(sock_get_timestamp);
2304 
2305 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2306 {
2307 	struct timespec ts;
2308 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2309 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2310 	ts = ktime_to_timespec(sk->sk_stamp);
2311 	if (ts.tv_sec == -1)
2312 		return -ENOENT;
2313 	if (ts.tv_sec == 0) {
2314 		sk->sk_stamp = ktime_get_real();
2315 		ts = ktime_to_timespec(sk->sk_stamp);
2316 	}
2317 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2318 }
2319 EXPORT_SYMBOL(sock_get_timestampns);
2320 
2321 void sock_enable_timestamp(struct sock *sk, int flag)
2322 {
2323 	if (!sock_flag(sk, flag)) {
2324 		unsigned long previous_flags = sk->sk_flags;
2325 
2326 		sock_set_flag(sk, flag);
2327 		/*
2328 		 * we just set one of the two flags which require net
2329 		 * time stamping, but time stamping might have been on
2330 		 * already because of the other one
2331 		 */
2332 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2333 			net_enable_timestamp();
2334 	}
2335 }
2336 
2337 /*
2338  *	Get a socket option on a socket.
2339  *
2340  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2341  *	asynchronous errors should be reported by getsockopt. We assume
2342  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2343  */
2344 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2345 			   char __user *optval, int __user *optlen)
2346 {
2347 	struct sock *sk = sock->sk;
2348 
2349 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2350 }
2351 EXPORT_SYMBOL(sock_common_getsockopt);
2352 
2353 #ifdef CONFIG_COMPAT
2354 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2355 				  char __user *optval, int __user *optlen)
2356 {
2357 	struct sock *sk = sock->sk;
2358 
2359 	if (sk->sk_prot->compat_getsockopt != NULL)
2360 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2361 						      optval, optlen);
2362 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2363 }
2364 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2365 #endif
2366 
2367 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2368 			struct msghdr *msg, size_t size, int flags)
2369 {
2370 	struct sock *sk = sock->sk;
2371 	int addr_len = 0;
2372 	int err;
2373 
2374 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2375 				   flags & ~MSG_DONTWAIT, &addr_len);
2376 	if (err >= 0)
2377 		msg->msg_namelen = addr_len;
2378 	return err;
2379 }
2380 EXPORT_SYMBOL(sock_common_recvmsg);
2381 
2382 /*
2383  *	Set socket options on an inet socket.
2384  */
2385 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2386 			   char __user *optval, unsigned int optlen)
2387 {
2388 	struct sock *sk = sock->sk;
2389 
2390 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2391 }
2392 EXPORT_SYMBOL(sock_common_setsockopt);
2393 
2394 #ifdef CONFIG_COMPAT
2395 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2396 				  char __user *optval, unsigned int optlen)
2397 {
2398 	struct sock *sk = sock->sk;
2399 
2400 	if (sk->sk_prot->compat_setsockopt != NULL)
2401 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2402 						      optval, optlen);
2403 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2404 }
2405 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2406 #endif
2407 
2408 void sk_common_release(struct sock *sk)
2409 {
2410 	if (sk->sk_prot->destroy)
2411 		sk->sk_prot->destroy(sk);
2412 
2413 	/*
2414 	 * Observation: when sk_common_release is called, processes have
2415 	 * no access to the socket, but the network stack still does.
2416 	 * Step one, detach it from networking:
2417 	 *
2418 	 * A. Remove it from the hash tables.
2419 	 */
2420 
2421 	sk->sk_prot->unhash(sk);
2422 
2423 	/*
2424 	 * At this point the socket cannot receive new packets, but some may
2425 	 * still be in flight because another CPU ran the receiver and did its
2426 	 * hash table lookup before we unhashed the socket. They will reach the
2427 	 * receive queue and be purged by the socket destructor.
2428 	 *
2429 	 * We may also still have packets pending on the receive queue and,
2430 	 * probably, our own packets waiting in device queues. sock_destroy
2431 	 * will drain the receive queue, but transmitted packets will delay
2432 	 * socket destruction until the last reference is released.
2433 	 */
2434 
2435 	sock_orphan(sk);
2436 
2437 	xfrm_sk_free_policy(sk);
2438 
2439 	sk_refcnt_debug_release(sk);
2440 
2441 	if (sk->sk_frag.page) {
2442 		put_page(sk->sk_frag.page);
2443 		sk->sk_frag.page = NULL;
2444 	}
2445 
2446 	sock_put(sk);
2447 }
2448 EXPORT_SYMBOL(sk_common_release);
2449 
2450 #ifdef CONFIG_PROC_FS
2451 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2452 struct prot_inuse {
2453 	int val[PROTO_INUSE_NR];
2454 };
2455 
2456 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2457 
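/*
 * Per-protocol, per-cpu socket-in-use counters, reported in the "sockets"
 * column of /proc/net/protocols. Each registered proto gets a slot in the
 * val[] array via assign_proto_idx(); sock_prot_inuse_add() is typically
 * called from the protocols' hash/unhash paths, and sock_prot_inuse_get()
 * sums the per-cpu values.
 */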
2458 #ifdef CONFIG_NET_NS
2459 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2460 {
2461 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2462 }
2463 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2464 
2465 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2466 {
2467 	int cpu, idx = prot->inuse_idx;
2468 	int res = 0;
2469 
2470 	for_each_possible_cpu(cpu)
2471 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2472 
2473 	return res >= 0 ? res : 0;
2474 }
2475 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2476 
2477 static int __net_init sock_inuse_init_net(struct net *net)
2478 {
2479 	net->core.inuse = alloc_percpu(struct prot_inuse);
2480 	return net->core.inuse ? 0 : -ENOMEM;
2481 }
2482 
2483 static void __net_exit sock_inuse_exit_net(struct net *net)
2484 {
2485 	free_percpu(net->core.inuse);
2486 }
2487 
2488 static struct pernet_operations net_inuse_ops = {
2489 	.init = sock_inuse_init_net,
2490 	.exit = sock_inuse_exit_net,
2491 };
2492 
2493 static __init int net_inuse_init(void)
2494 {
2495 	if (register_pernet_subsys(&net_inuse_ops))
2496 		panic("Cannot initialize net inuse counters");
2497 
2498 	return 0;
2499 }
2500 
2501 core_initcall(net_inuse_init);
2502 #else
2503 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2504 
2505 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2506 {
2507 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2508 }
2509 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2510 
2511 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2512 {
2513 	int cpu, idx = prot->inuse_idx;
2514 	int res = 0;
2515 
2516 	for_each_possible_cpu(cpu)
2517 		res += per_cpu(prot_inuse, cpu).val[idx];
2518 
2519 	return res >= 0 ? res : 0;
2520 }
2521 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2522 #endif
2523 
2524 static void assign_proto_idx(struct proto *prot)
2525 {
2526 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2527 
2528 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2529 		pr_err("PROTO_INUSE_NR exhausted\n");
2530 		return;
2531 	}
2532 
2533 	set_bit(prot->inuse_idx, proto_inuse_idx);
2534 }
2535 
2536 static void release_proto_idx(struct proto *prot)
2537 {
2538 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2539 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2540 }
2541 #else
2542 static inline void assign_proto_idx(struct proto *prot)
2543 {
2544 }
2545 
2546 static inline void release_proto_idx(struct proto *prot)
2547 {
2548 }
2549 #endif
2550 
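/*
 * Register a transport protocol with the core. When @alloc_slab is set,
 * slab caches are created for the protocol's sock objects and, if provided,
 * for its request_sock and timewait_sock objects. Protocol modules
 * typically do something like (sketch):
 *
 *	err = proto_register(&example_prot, 1);
 *	...
 *	proto_unregister(&example_prot);
 *
 * Returns 0 on success or -ENOBUFS if a cache could not be created.
 */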
2551 int proto_register(struct proto *prot, int alloc_slab)
2552 {
2553 	if (alloc_slab) {
2554 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2555 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2556 					NULL);
2557 
2558 		if (prot->slab == NULL) {
2559 			pr_crit("%s: Can't create sock SLAB cache!\n",
2560 				prot->name);
2561 			goto out;
2562 		}
2563 
2564 		if (prot->rsk_prot != NULL) {
2565 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2566 			if (prot->rsk_prot->slab_name == NULL)
2567 				goto out_free_sock_slab;
2568 
2569 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2570 								 prot->rsk_prot->obj_size, 0,
2571 								 SLAB_HWCACHE_ALIGN, NULL);
2572 
2573 			if (prot->rsk_prot->slab == NULL) {
2574 				pr_crit("%s: Can't create request sock SLAB cache!\n",
2575 					prot->name);
2576 				goto out_free_request_sock_slab_name;
2577 			}
2578 		}
2579 
2580 		if (prot->twsk_prot != NULL) {
2581 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2582 
2583 			if (prot->twsk_prot->twsk_slab_name == NULL)
2584 				goto out_free_request_sock_slab;
2585 
2586 			prot->twsk_prot->twsk_slab =
2587 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2588 						  prot->twsk_prot->twsk_obj_size,
2589 						  0,
2590 						  SLAB_HWCACHE_ALIGN |
2591 							prot->slab_flags,
2592 						  NULL);
2593 			if (prot->twsk_prot->twsk_slab == NULL)
2594 				goto out_free_timewait_sock_slab_name;
2595 		}
2596 	}
2597 
2598 	mutex_lock(&proto_list_mutex);
2599 	list_add(&prot->node, &proto_list);
2600 	assign_proto_idx(prot);
2601 	mutex_unlock(&proto_list_mutex);
2602 	return 0;
2603 
2604 out_free_timewait_sock_slab_name:
2605 	kfree(prot->twsk_prot->twsk_slab_name);
2606 out_free_request_sock_slab:
2607 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2608 		kmem_cache_destroy(prot->rsk_prot->slab);
2609 		prot->rsk_prot->slab = NULL;
2610 	}
2611 out_free_request_sock_slab_name:
2612 	if (prot->rsk_prot)
2613 		kfree(prot->rsk_prot->slab_name);
2614 out_free_sock_slab:
2615 	kmem_cache_destroy(prot->slab);
2616 	prot->slab = NULL;
2617 out:
2618 	return -ENOBUFS;
2619 }
2620 EXPORT_SYMBOL(proto_register);
2621 
2622 void proto_unregister(struct proto *prot)
2623 {
2624 	mutex_lock(&proto_list_mutex);
2625 	release_proto_idx(prot);
2626 	list_del(&prot->node);
2627 	mutex_unlock(&proto_list_mutex);
2628 
2629 	if (prot->slab != NULL) {
2630 		kmem_cache_destroy(prot->slab);
2631 		prot->slab = NULL;
2632 	}
2633 
2634 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2635 		kmem_cache_destroy(prot->rsk_prot->slab);
2636 		kfree(prot->rsk_prot->slab_name);
2637 		prot->rsk_prot->slab = NULL;
2638 	}
2639 
2640 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2641 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2642 		kfree(prot->twsk_prot->twsk_slab_name);
2643 		prot->twsk_prot->twsk_slab = NULL;
2644 	}
2645 }
2646 EXPORT_SYMBOL(proto_unregister);
2647 
2648 #ifdef CONFIG_PROC_FS
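/* seq_file interface behind /proc/net/protocols */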
2649 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2650 	__acquires(proto_list_mutex)
2651 {
2652 	mutex_lock(&proto_list_mutex);
2653 	return seq_list_start_head(&proto_list, *pos);
2654 }
2655 
2656 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2657 {
2658 	return seq_list_next(v, &proto_list, pos);
2659 }
2660 
2661 static void proto_seq_stop(struct seq_file *seq, void *v)
2662 	__releases(proto_list_mutex)
2663 {
2664 	mutex_unlock(&proto_list_mutex);
2665 }
2666 
2667 static char proto_method_implemented(const void *method)
2668 {
2669 	return method == NULL ? 'n' : 'y';
2670 }
2671 static long sock_prot_memory_allocated(struct proto *proto)
2672 {
2673 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2674 }
2675 
2676 static char *sock_prot_memory_pressure(struct proto *proto)
2677 {
2678 	return proto->memory_pressure != NULL ?
2679 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2680 }
2681 
2682 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2683 {
2684 
2685 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2686 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2687 		   proto->name,
2688 		   proto->obj_size,
2689 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2690 		   sock_prot_memory_allocated(proto),
2691 		   sock_prot_memory_pressure(proto),
2692 		   proto->max_header,
2693 		   proto->slab == NULL ? "no" : "yes",
2694 		   module_name(proto->owner),
2695 		   proto_method_implemented(proto->close),
2696 		   proto_method_implemented(proto->connect),
2697 		   proto_method_implemented(proto->disconnect),
2698 		   proto_method_implemented(proto->accept),
2699 		   proto_method_implemented(proto->ioctl),
2700 		   proto_method_implemented(proto->init),
2701 		   proto_method_implemented(proto->destroy),
2702 		   proto_method_implemented(proto->shutdown),
2703 		   proto_method_implemented(proto->setsockopt),
2704 		   proto_method_implemented(proto->getsockopt),
2705 		   proto_method_implemented(proto->sendmsg),
2706 		   proto_method_implemented(proto->recvmsg),
2707 		   proto_method_implemented(proto->sendpage),
2708 		   proto_method_implemented(proto->bind),
2709 		   proto_method_implemented(proto->backlog_rcv),
2710 		   proto_method_implemented(proto->hash),
2711 		   proto_method_implemented(proto->unhash),
2712 		   proto_method_implemented(proto->get_port),
2713 		   proto_method_implemented(proto->enter_memory_pressure));
2714 }
2715 
2716 static int proto_seq_show(struct seq_file *seq, void *v)
2717 {
2718 	if (v == &proto_list)
2719 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2720 			   "protocol",
2721 			   "size",
2722 			   "sockets",
2723 			   "memory",
2724 			   "press",
2725 			   "maxhdr",
2726 			   "slab",
2727 			   "module",
2728 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2729 	else
2730 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2731 	return 0;
2732 }
2733 
2734 static const struct seq_operations proto_seq_ops = {
2735 	.start  = proto_seq_start,
2736 	.next   = proto_seq_next,
2737 	.stop   = proto_seq_stop,
2738 	.show   = proto_seq_show,
2739 };
2740 
2741 static int proto_seq_open(struct inode *inode, struct file *file)
2742 {
2743 	return seq_open_net(inode, file, &proto_seq_ops,
2744 			    sizeof(struct seq_net_private));
2745 }
2746 
2747 static const struct file_operations proto_seq_fops = {
2748 	.owner		= THIS_MODULE,
2749 	.open		= proto_seq_open,
2750 	.read		= seq_read,
2751 	.llseek		= seq_lseek,
2752 	.release	= seq_release_net,
2753 };
2754 
2755 static __net_init int proto_init_net(struct net *net)
2756 {
2757 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2758 		return -ENOMEM;
2759 
2760 	return 0;
2761 }
2762 
2763 static __net_exit void proto_exit_net(struct net *net)
2764 {
2765 	proc_net_remove(net, "protocols");
2766 }
2767 
2768 
2769 static __net_initdata struct pernet_operations proto_net_ops = {
2770 	.init = proto_init_net,
2771 	.exit = proto_exit_net,
2772 };
2773 
2774 static int __init proto_init(void)
2775 {
2776 	return register_pernet_subsys(&proto_net_ops);
2777 }
2778 
2779 subsys_initcall(proto_init);
2780 
2781 #endif /* PROC_FS */
2782