xref: /linux/net/core/sock.c (revision dd220a00e8bd5ad7f98ecdc3eed699a7cfabdc27)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/jump_label.h>
115 #include <linux/memcontrol.h>
116 
117 #include <asm/uaccess.h>
118 #include <asm/system.h>
119 
120 #include <linux/netdevice.h>
121 #include <net/protocol.h>
122 #include <linux/skbuff.h>
123 #include <net/net_namespace.h>
124 #include <net/request_sock.h>
125 #include <net/sock.h>
126 #include <linux/net_tstamp.h>
127 #include <net/xfrm.h>
128 #include <linux/ipsec.h>
129 #include <net/cls_cgroup.h>
130 #include <net/netprio_cgroup.h>
131 
132 #include <linux/filter.h>
133 
134 #include <trace/events/sock.h>
135 
136 #ifdef CONFIG_INET
137 #include <net/tcp.h>
138 #endif
139 
140 static DEFINE_MUTEX(proto_list_mutex);
141 static LIST_HEAD(proto_list);
142 
143 #ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
144 int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
145 {
146 	struct proto *proto;
147 	int ret = 0;
148 
149 	mutex_lock(&proto_list_mutex);
150 	list_for_each_entry(proto, &proto_list, node) {
151 		if (proto->init_cgroup) {
152 			ret = proto->init_cgroup(cgrp, ss);
153 			if (ret)
154 				goto out;
155 		}
156 	}
157 
158 	mutex_unlock(&proto_list_mutex);
159 	return ret;
160 out:
161 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
162 		if (proto->destroy_cgroup)
163 			proto->destroy_cgroup(cgrp, ss);
164 	mutex_unlock(&proto_list_mutex);
165 	return ret;
166 }
167 
168 void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
169 {
170 	struct proto *proto;
171 
172 	mutex_lock(&proto_list_mutex);
173 	list_for_each_entry_reverse(proto, &proto_list, node)
174 		if (proto->destroy_cgroup)
175 			proto->destroy_cgroup(cgrp, ss);
176 	mutex_unlock(&proto_list_mutex);
177 }
178 #endif
179 
180 /*
181  * Each address family might have different locking rules, so we have
182  * one slock key per address family:
183  */
184 static struct lock_class_key af_family_keys[AF_MAX];
185 static struct lock_class_key af_family_slock_keys[AF_MAX];
186 
187 struct jump_label_key memcg_socket_limit_enabled;
188 EXPORT_SYMBOL(memcg_socket_limit_enabled);
189 
190 /*
191  * Make lock validator output more readable. (We pre-construct these
192  * strings at build time, so that runtime initialization of socket
193  * locks is fast):
194  */
195 static const char *const af_family_key_strings[AF_MAX+1] = {
196   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
197   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
198   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
199   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
200   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
201   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
202   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
203   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
204   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
205   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
206   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
207   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
208   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
209   "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
210 };
211 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
212   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
213   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
214   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
215   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
216   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
217   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
218   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
219   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
220   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
221   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
222   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
223   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
224   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
225   "slock-AF_NFC"   , "slock-AF_MAX"
226 };
227 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
228   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
229   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
230   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
231   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
232   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
233   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
234   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
235   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
236   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
237   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
238   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
239   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
240   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
241   "clock-AF_NFC"   , "clock-AF_MAX"
242 };
243 
244 /*
245  * sk_callback_lock locking rules are per-address-family,
246  * so split the lock classes by using a per-AF key:
247  */
248 static struct lock_class_key af_callback_keys[AF_MAX];
249 
250 /* Take into consideration the size of the struct sk_buff overhead in the
251  * determination of these values, since that is non-constant across
252  * platforms.  This makes socket queueing behavior and performance
253  * not depend upon such differences.
254  */
255 #define _SK_MEM_PACKETS		256
256 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
257 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
258 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
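
/* Rough worked example (illustrative only; the exact figure depends on the
 * architecture and on the layout of struct sk_buff and struct skb_shared_info,
 * assuming the usual SKB_TRUESIZE() definition from skbuff.h):
 *
 *	SK_WMEM_MAX = 256 * SKB_TRUESIZE(256)
 *		    = 256 * (256 + SKB_DATA_ALIGN(sizeof(struct sk_buff))
 *			         + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
 *
 * i.e. several hundred bytes per queued packet, so the default send and
 * receive buffer limits come out in the low hundreds of kilobytes.
 */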
259 
260 /* Run time adjustable parameters. */
261 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
262 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
263 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
264 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
265 
266 /* Maximal space eaten by iovec or ancillary data plus some space */
267 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
268 EXPORT_SYMBOL(sysctl_optmem_max);
269 
270 #if defined(CONFIG_CGROUPS)
271 #if !defined(CONFIG_NET_CLS_CGROUP)
272 int net_cls_subsys_id = -1;
273 EXPORT_SYMBOL_GPL(net_cls_subsys_id);
274 #endif
275 #if !defined(CONFIG_NETPRIO_CGROUP)
276 int net_prio_subsys_id = -1;
277 EXPORT_SYMBOL_GPL(net_prio_subsys_id);
278 #endif
279 #endif
280 
281 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
282 {
283 	struct timeval tv;
284 
285 	if (optlen < sizeof(tv))
286 		return -EINVAL;
287 	if (copy_from_user(&tv, optval, sizeof(tv)))
288 		return -EFAULT;
289 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
290 		return -EDOM;
291 
292 	if (tv.tv_sec < 0) {
293 		static int warned __read_mostly;
294 
295 		*timeo_p = 0;
296 		if (warned < 10 && net_ratelimit()) {
297 			warned++;
298 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
299 			       "tries to set negative timeout\n",
300 				current->comm, task_pid_nr(current));
301 		}
302 		return 0;
303 	}
304 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
305 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
306 		return 0;
307 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
308 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
309 	return 0;
310 }
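
/* Hypothetical userspace usage (not part of this file) of the option that
 * this helper parses for SO_RCVTIMEO/SO_SNDTIMEO:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *	setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
 *
 * A zero timeval means "wait forever" (MAX_SCHEDULE_TIMEOUT), a tv_usec
 * outside [0, USEC_PER_SEC) is rejected with -EDOM, and a negative tv_sec
 * is clamped to an immediate timeout.
 */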
311 
312 static void sock_warn_obsolete_bsdism(const char *name)
313 {
314 	static int warned;
315 	static char warncomm[TASK_COMM_LEN];
316 	if (strcmp(warncomm, current->comm) && warned < 5) {
317 		strcpy(warncomm,  current->comm);
318 		printk(KERN_WARNING "process `%s' is using obsolete "
319 		       "%s SO_BSDCOMPAT\n", warncomm, name);
320 		warned++;
321 	}
322 }
323 
324 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
325 
326 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
327 {
328 	if (sk->sk_flags & flags) {
329 		sk->sk_flags &= ~flags;
330 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
331 			net_disable_timestamp();
332 	}
333 }
334 
335 
336 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
337 {
338 	int err;
339 	int skb_len;
340 	unsigned long flags;
341 	struct sk_buff_head *list = &sk->sk_receive_queue;
342 
343 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
344 		atomic_inc(&sk->sk_drops);
345 		trace_sock_rcvqueue_full(sk, skb);
346 		return -ENOMEM;
347 	}
348 
349 	err = sk_filter(sk, skb);
350 	if (err)
351 		return err;
352 
353 	if (!sk_rmem_schedule(sk, skb->truesize)) {
354 		atomic_inc(&sk->sk_drops);
355 		return -ENOBUFS;
356 	}
357 
358 	skb->dev = NULL;
359 	skb_set_owner_r(skb, sk);
360 
361 	/* Cache the SKB length before we tack it onto the receive
362 	 * queue.  Once it is added it no longer belongs to us and
363 	 * may be freed by other threads of control pulling packets
364 	 * from the queue.
365 	 */
366 	skb_len = skb->len;
367 
368 	/* we escape from the rcu protected region, make sure we don't leak
369 	 * a non-refcounted dst
370 	 */
371 	skb_dst_force(skb);
372 
373 	spin_lock_irqsave(&list->lock, flags);
374 	skb->dropcount = atomic_read(&sk->sk_drops);
375 	__skb_queue_tail(list, skb);
376 	spin_unlock_irqrestore(&list->lock, flags);
377 
378 	if (!sock_flag(sk, SOCK_DEAD))
379 		sk->sk_data_ready(sk, skb_len);
380 	return 0;
381 }
382 EXPORT_SYMBOL(sock_queue_rcv_skb);
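
/* Minimal sketch (hypothetical protocol receive path, not part of this file)
 * of the usual calling convention: on error (-ENOMEM when the receive buffer
 * is full, -ENOBUFS when no rmem can be scheduled, or a filter verdict) the
 * caller still owns the skb and must free it itself.
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0)
 *		kfree_skb(skb);
 */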
383 
384 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
385 {
386 	int rc = NET_RX_SUCCESS;
387 
388 	if (sk_filter(sk, skb))
389 		goto discard_and_relse;
390 
391 	skb->dev = NULL;
392 
393 	if (sk_rcvqueues_full(sk, skb)) {
394 		atomic_inc(&sk->sk_drops);
395 		goto discard_and_relse;
396 	}
397 	if (nested)
398 		bh_lock_sock_nested(sk);
399 	else
400 		bh_lock_sock(sk);
401 	if (!sock_owned_by_user(sk)) {
402 		/*
403 		 * trylock + unlock semantics:
404 		 */
405 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
406 
407 		rc = sk_backlog_rcv(sk, skb);
408 
409 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
410 	} else if (sk_add_backlog(sk, skb)) {
411 		bh_unlock_sock(sk);
412 		atomic_inc(&sk->sk_drops);
413 		goto discard_and_relse;
414 	}
415 
416 	bh_unlock_sock(sk);
417 out:
418 	sock_put(sk);
419 	return rc;
420 discard_and_relse:
421 	kfree_skb(skb);
422 	goto out;
423 }
424 EXPORT_SYMBOL(sk_receive_skb);
425 
426 void sk_reset_txq(struct sock *sk)
427 {
428 	sk_tx_queue_clear(sk);
429 }
430 EXPORT_SYMBOL(sk_reset_txq);
431 
432 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
433 {
434 	struct dst_entry *dst = __sk_dst_get(sk);
435 
436 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
437 		sk_tx_queue_clear(sk);
438 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
439 		dst_release(dst);
440 		return NULL;
441 	}
442 
443 	return dst;
444 }
445 EXPORT_SYMBOL(__sk_dst_check);
446 
447 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
448 {
449 	struct dst_entry *dst = sk_dst_get(sk);
450 
451 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
452 		sk_dst_reset(sk);
453 		dst_release(dst);
454 		return NULL;
455 	}
456 
457 	return dst;
458 }
459 EXPORT_SYMBOL(sk_dst_check);
460 
461 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
462 {
463 	int ret = -ENOPROTOOPT;
464 #ifdef CONFIG_NETDEVICES
465 	struct net *net = sock_net(sk);
466 	char devname[IFNAMSIZ];
467 	int index;
468 
469 	/* Sorry... */
470 	ret = -EPERM;
471 	if (!capable(CAP_NET_RAW))
472 		goto out;
473 
474 	ret = -EINVAL;
475 	if (optlen < 0)
476 		goto out;
477 
478 	/* Bind this socket to a particular device like "eth0",
479 	 * as specified in the passed interface name. If the
480 	 * name is "" or the option length is zero the socket
481 	 * is not bound.
482 	 */
483 	if (optlen > IFNAMSIZ - 1)
484 		optlen = IFNAMSIZ - 1;
485 	memset(devname, 0, sizeof(devname));
486 
487 	ret = -EFAULT;
488 	if (copy_from_user(devname, optval, optlen))
489 		goto out;
490 
491 	index = 0;
492 	if (devname[0] != '\0') {
493 		struct net_device *dev;
494 
495 		rcu_read_lock();
496 		dev = dev_get_by_name_rcu(net, devname);
497 		if (dev)
498 			index = dev->ifindex;
499 		rcu_read_unlock();
500 		ret = -ENODEV;
501 		if (!dev)
502 			goto out;
503 	}
504 
505 	lock_sock(sk);
506 	sk->sk_bound_dev_if = index;
507 	sk_dst_reset(sk);
508 	release_sock(sk);
509 
510 	ret = 0;
511 
512 out:
513 #endif
514 
515 	return ret;
516 }
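
/* Hypothetical userspace usage (not part of this file): binding a socket to
 * one interface requires CAP_NET_RAW; an empty name (or a zero option length)
 * removes the binding again.
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", strlen("eth0") + 1);
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "", 0);
 */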
517 
518 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
519 {
520 	if (valbool)
521 		sock_set_flag(sk, bit);
522 	else
523 		sock_reset_flag(sk, bit);
524 }
525 
526 /*
527  *	This is meant for all protocols to use and covers goings on
528  *	at the socket level. Everything here is generic.
529  */
530 
531 int sock_setsockopt(struct socket *sock, int level, int optname,
532 		    char __user *optval, unsigned int optlen)
533 {
534 	struct sock *sk = sock->sk;
535 	int val;
536 	int valbool;
537 	struct linger ling;
538 	int ret = 0;
539 
540 	/*
541 	 *	Options without arguments
542 	 */
543 
544 	if (optname == SO_BINDTODEVICE)
545 		return sock_bindtodevice(sk, optval, optlen);
546 
547 	if (optlen < sizeof(int))
548 		return -EINVAL;
549 
550 	if (get_user(val, (int __user *)optval))
551 		return -EFAULT;
552 
553 	valbool = val ? 1 : 0;
554 
555 	lock_sock(sk);
556 
557 	switch (optname) {
558 	case SO_DEBUG:
559 		if (val && !capable(CAP_NET_ADMIN))
560 			ret = -EACCES;
561 		else
562 			sock_valbool_flag(sk, SOCK_DBG, valbool);
563 		break;
564 	case SO_REUSEADDR:
565 		sk->sk_reuse = valbool;
566 		break;
567 	case SO_TYPE:
568 	case SO_PROTOCOL:
569 	case SO_DOMAIN:
570 	case SO_ERROR:
571 		ret = -ENOPROTOOPT;
572 		break;
573 	case SO_DONTROUTE:
574 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
575 		break;
576 	case SO_BROADCAST:
577 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
578 		break;
579 	case SO_SNDBUF:
580 		/* Don't return an error on this; BSD doesn't, and if you
581 		   think about it, this is right. Otherwise apps have to
582 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
583 		   are treated in BSD as hints. */
584 
585 		if (val > sysctl_wmem_max)
586 			val = sysctl_wmem_max;
587 set_sndbuf:
588 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
589 		if ((val * 2) < SOCK_MIN_SNDBUF)
590 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
591 		else
592 			sk->sk_sndbuf = val * 2;
593 
594 		/*
595 		 *	Wake up sending tasks if we
596 		 *	upped the value.
597 		 */
598 		sk->sk_write_space(sk);
599 		break;
600 
601 	case SO_SNDBUFFORCE:
602 		if (!capable(CAP_NET_ADMIN)) {
603 			ret = -EPERM;
604 			break;
605 		}
606 		goto set_sndbuf;
607 
608 	case SO_RCVBUF:
609 		/* Don't return an error on this; BSD doesn't, and if you
610 		   think about it, this is right. Otherwise apps have to
611 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
612 		   are treated in BSD as hints. */
613 
614 		if (val > sysctl_rmem_max)
615 			val = sysctl_rmem_max;
616 set_rcvbuf:
617 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
618 		/*
619 		 * We double it on the way in to account for
620 		 * "struct sk_buff" etc. overhead.   Applications
621 		 * assume that the SO_RCVBUF setting they make will
622 		 * allow that much actual data to be received on that
623 		 * socket.
624 		 *
625 		 * Applications are unaware that "struct sk_buff" and
626 		 * other overheads allocate from the receive buffer
627 		 * during socket buffer allocation.
628 		 *
629 		 * And after considering the possible alternatives,
630 		 * returning the value we actually used in getsockopt
631 		 * is the most desirable behavior.
632 		 */
633 		if ((val * 2) < SOCK_MIN_RCVBUF)
634 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
635 		else
636 			sk->sk_rcvbuf = val * 2;
637 		break;
638 
639 	case SO_RCVBUFFORCE:
640 		if (!capable(CAP_NET_ADMIN)) {
641 			ret = -EPERM;
642 			break;
643 		}
644 		goto set_rcvbuf;
645 
646 	case SO_KEEPALIVE:
647 #ifdef CONFIG_INET
648 		if (sk->sk_protocol == IPPROTO_TCP)
649 			tcp_set_keepalive(sk, valbool);
650 #endif
651 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
652 		break;
653 
654 	case SO_OOBINLINE:
655 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
656 		break;
657 
658 	case SO_NO_CHECK:
659 		sk->sk_no_check = valbool;
660 		break;
661 
662 	case SO_PRIORITY:
663 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
664 			sk->sk_priority = val;
665 		else
666 			ret = -EPERM;
667 		break;
668 
669 	case SO_LINGER:
670 		if (optlen < sizeof(ling)) {
671 			ret = -EINVAL;	/* 1003.1g */
672 			break;
673 		}
674 		if (copy_from_user(&ling, optval, sizeof(ling))) {
675 			ret = -EFAULT;
676 			break;
677 		}
678 		if (!ling.l_onoff)
679 			sock_reset_flag(sk, SOCK_LINGER);
680 		else {
681 #if (BITS_PER_LONG == 32)
682 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
683 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
684 			else
685 #endif
686 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
687 			sock_set_flag(sk, SOCK_LINGER);
688 		}
689 		break;
690 
691 	case SO_BSDCOMPAT:
692 		sock_warn_obsolete_bsdism("setsockopt");
693 		break;
694 
695 	case SO_PASSCRED:
696 		if (valbool)
697 			set_bit(SOCK_PASSCRED, &sock->flags);
698 		else
699 			clear_bit(SOCK_PASSCRED, &sock->flags);
700 		break;
701 
702 	case SO_TIMESTAMP:
703 	case SO_TIMESTAMPNS:
704 		if (valbool)  {
705 			if (optname == SO_TIMESTAMP)
706 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
707 			else
708 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
709 			sock_set_flag(sk, SOCK_RCVTSTAMP);
710 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
711 		} else {
712 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
713 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
714 		}
715 		break;
716 
717 	case SO_TIMESTAMPING:
718 		if (val & ~SOF_TIMESTAMPING_MASK) {
719 			ret = -EINVAL;
720 			break;
721 		}
722 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
723 				  val & SOF_TIMESTAMPING_TX_HARDWARE);
724 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
725 				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
726 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
727 				  val & SOF_TIMESTAMPING_RX_HARDWARE);
728 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
729 			sock_enable_timestamp(sk,
730 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
731 		else
732 			sock_disable_timestamp(sk,
733 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
734 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
735 				  val & SOF_TIMESTAMPING_SOFTWARE);
736 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
737 				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
738 		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
739 				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
740 		break;
741 
742 	case SO_RCVLOWAT:
743 		if (val < 0)
744 			val = INT_MAX;
745 		sk->sk_rcvlowat = val ? : 1;
746 		break;
747 
748 	case SO_RCVTIMEO:
749 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
750 		break;
751 
752 	case SO_SNDTIMEO:
753 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
754 		break;
755 
756 	case SO_ATTACH_FILTER:
757 		ret = -EINVAL;
758 		if (optlen == sizeof(struct sock_fprog)) {
759 			struct sock_fprog fprog;
760 
761 			ret = -EFAULT;
762 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
763 				break;
764 
765 			ret = sk_attach_filter(&fprog, sk);
766 		}
767 		break;
768 
769 	case SO_DETACH_FILTER:
770 		ret = sk_detach_filter(sk);
771 		break;
772 
773 	case SO_PASSSEC:
774 		if (valbool)
775 			set_bit(SOCK_PASSSEC, &sock->flags);
776 		else
777 			clear_bit(SOCK_PASSSEC, &sock->flags);
778 		break;
779 	case SO_MARK:
780 		if (!capable(CAP_NET_ADMIN))
781 			ret = -EPERM;
782 		else
783 			sk->sk_mark = val;
784 		break;
785 
786 		/* We implement SO_SNDLOWAT etc. as not
787 		   settable (1003.1g 5.3). */
788 	case SO_RXQ_OVFL:
789 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
790 		break;
791 
792 	case SO_WIFI_STATUS:
793 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
794 		break;
795 
796 	default:
797 		ret = -ENOPROTOOPT;
798 		break;
799 	}
800 	release_sock(sk);
801 	return ret;
802 }
803 EXPORT_SYMBOL(sock_setsockopt);
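
/* Worked example (hypothetical userspace code, not part of this file) of the
 * RCVBUF/SNDBUF doubling documented above: the kernel stores twice the
 * requested value to cover struct sk_buff overhead, and getsockopt() reports
 * the doubled figure (assuming the request does not exceed sysctl_rmem_max,
 * i.e. net.core.rmem_max).
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);
 *
 * out is then 131072.
 */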
804 
805 
806 void cred_to_ucred(struct pid *pid, const struct cred *cred,
807 		   struct ucred *ucred)
808 {
809 	ucred->pid = pid_vnr(pid);
810 	ucred->uid = ucred->gid = -1;
811 	if (cred) {
812 		struct user_namespace *current_ns = current_user_ns();
813 
814 		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
815 		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
816 	}
817 }
818 EXPORT_SYMBOL_GPL(cred_to_ucred);
819 
820 int sock_getsockopt(struct socket *sock, int level, int optname,
821 		    char __user *optval, int __user *optlen)
822 {
823 	struct sock *sk = sock->sk;
824 
825 	union {
826 		int val;
827 		struct linger ling;
828 		struct timeval tm;
829 	} v;
830 
831 	int lv = sizeof(int);
832 	int len;
833 
834 	if (get_user(len, optlen))
835 		return -EFAULT;
836 	if (len < 0)
837 		return -EINVAL;
838 
839 	memset(&v, 0, sizeof(v));
840 
841 	switch (optname) {
842 	case SO_DEBUG:
843 		v.val = sock_flag(sk, SOCK_DBG);
844 		break;
845 
846 	case SO_DONTROUTE:
847 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
848 		break;
849 
850 	case SO_BROADCAST:
851 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
852 		break;
853 
854 	case SO_SNDBUF:
855 		v.val = sk->sk_sndbuf;
856 		break;
857 
858 	case SO_RCVBUF:
859 		v.val = sk->sk_rcvbuf;
860 		break;
861 
862 	case SO_REUSEADDR:
863 		v.val = sk->sk_reuse;
864 		break;
865 
866 	case SO_KEEPALIVE:
867 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
868 		break;
869 
870 	case SO_TYPE:
871 		v.val = sk->sk_type;
872 		break;
873 
874 	case SO_PROTOCOL:
875 		v.val = sk->sk_protocol;
876 		break;
877 
878 	case SO_DOMAIN:
879 		v.val = sk->sk_family;
880 		break;
881 
882 	case SO_ERROR:
883 		v.val = -sock_error(sk);
884 		if (v.val == 0)
885 			v.val = xchg(&sk->sk_err_soft, 0);
886 		break;
887 
888 	case SO_OOBINLINE:
889 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
890 		break;
891 
892 	case SO_NO_CHECK:
893 		v.val = sk->sk_no_check;
894 		break;
895 
896 	case SO_PRIORITY:
897 		v.val = sk->sk_priority;
898 		break;
899 
900 	case SO_LINGER:
901 		lv		= sizeof(v.ling);
902 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
903 		v.ling.l_linger	= sk->sk_lingertime / HZ;
904 		break;
905 
906 	case SO_BSDCOMPAT:
907 		sock_warn_obsolete_bsdism("getsockopt");
908 		break;
909 
910 	case SO_TIMESTAMP:
911 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
912 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
913 		break;
914 
915 	case SO_TIMESTAMPNS:
916 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
917 		break;
918 
919 	case SO_TIMESTAMPING:
920 		v.val = 0;
921 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
922 			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
923 		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
924 			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
925 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
926 			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
927 		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
928 			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
929 		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
930 			v.val |= SOF_TIMESTAMPING_SOFTWARE;
931 		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
932 			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
933 		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
934 			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
935 		break;
936 
937 	case SO_RCVTIMEO:
938 		lv = sizeof(struct timeval);
939 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
940 			v.tm.tv_sec = 0;
941 			v.tm.tv_usec = 0;
942 		} else {
943 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
944 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
945 		}
946 		break;
947 
948 	case SO_SNDTIMEO:
949 		lv = sizeof(struct timeval);
950 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
951 			v.tm.tv_sec = 0;
952 			v.tm.tv_usec = 0;
953 		} else {
954 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
955 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
956 		}
957 		break;
958 
959 	case SO_RCVLOWAT:
960 		v.val = sk->sk_rcvlowat;
961 		break;
962 
963 	case SO_SNDLOWAT:
964 		v.val = 1;
965 		break;
966 
967 	case SO_PASSCRED:
968 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
969 		break;
970 
971 	case SO_PEERCRED:
972 	{
973 		struct ucred peercred;
974 		if (len > sizeof(peercred))
975 			len = sizeof(peercred);
976 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
977 		if (copy_to_user(optval, &peercred, len))
978 			return -EFAULT;
979 		goto lenout;
980 	}
981 
982 	case SO_PEERNAME:
983 	{
984 		char address[128];
985 
986 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
987 			return -ENOTCONN;
988 		if (lv < len)
989 			return -EINVAL;
990 		if (copy_to_user(optval, address, len))
991 			return -EFAULT;
992 		goto lenout;
993 	}
994 
995 	/* Dubious BSD thing... Probably nobody even uses it, but
996 	 * the UNIX standard wants it for whatever reason... -DaveM
997 	 */
998 	case SO_ACCEPTCONN:
999 		v.val = sk->sk_state == TCP_LISTEN;
1000 		break;
1001 
1002 	case SO_PASSSEC:
1003 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
1004 		break;
1005 
1006 	case SO_PEERSEC:
1007 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1008 
1009 	case SO_MARK:
1010 		v.val = sk->sk_mark;
1011 		break;
1012 
1013 	case SO_RXQ_OVFL:
1014 		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
1015 		break;
1016 
1017 	case SO_WIFI_STATUS:
1018 		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
1019 		break;
1020 
1021 	default:
1022 		return -ENOPROTOOPT;
1023 	}
1024 
1025 	if (len > lv)
1026 		len = lv;
1027 	if (copy_to_user(optval, &v, len))
1028 		return -EFAULT;
1029 lenout:
1030 	if (put_user(len, optlen))
1031 		return -EFAULT;
1032 	return 0;
1033 }
1034 
1035 /*
1036  * Initialize an sk_lock.
1037  *
1038  * (We also register the sk_lock with the lock validator.)
1039  */
1040 static inline void sock_lock_init(struct sock *sk)
1041 {
1042 	sock_lock_init_class_and_name(sk,
1043 			af_family_slock_key_strings[sk->sk_family],
1044 			af_family_slock_keys + sk->sk_family,
1045 			af_family_key_strings[sk->sk_family],
1046 			af_family_keys + sk->sk_family);
1047 }
1048 
1049 /*
1050  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1051  * even temporarily, because of RCU lookups. sk_node should also be left as-is.
1052  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
1053  */
1054 static void sock_copy(struct sock *nsk, const struct sock *osk)
1055 {
1056 #ifdef CONFIG_SECURITY_NETWORK
1057 	void *sptr = nsk->sk_security;
1058 #endif
1059 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1060 
1061 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1062 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1063 
1064 #ifdef CONFIG_SECURITY_NETWORK
1065 	nsk->sk_security = sptr;
1066 	security_sk_clone(osk, nsk);
1067 #endif
1068 }
1069 
1070 /*
1071  * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
1072  * nodes unmodified. Special care is taken when initializing the object to zero.
1073  */
1074 static inline void sk_prot_clear_nulls(struct sock *sk, int size)
1075 {
1076 	if (offsetof(struct sock, sk_node.next) != 0)
1077 		memset(sk, 0, offsetof(struct sock, sk_node.next));
1078 	memset(&sk->sk_node.pprev, 0,
1079 	       size - offsetof(struct sock, sk_node.pprev));
1080 }
1081 
1082 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1083 {
1084 	unsigned long nulls1, nulls2;
1085 
1086 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1087 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1088 	if (nulls1 > nulls2)
1089 		swap(nulls1, nulls2);
1090 
1091 	if (nulls1 != 0)
1092 		memset((char *)sk, 0, nulls1);
1093 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1094 	       nulls2 - nulls1 - sizeof(void *));
1095 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1096 	       size - nulls2 - sizeof(void *));
1097 }
1098 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1099 
1100 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1101 		int family)
1102 {
1103 	struct sock *sk;
1104 	struct kmem_cache *slab;
1105 
1106 	slab = prot->slab;
1107 	if (slab != NULL) {
1108 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1109 		if (!sk)
1110 			return sk;
1111 		if (priority & __GFP_ZERO) {
1112 			if (prot->clear_sk)
1113 				prot->clear_sk(sk, prot->obj_size);
1114 			else
1115 				sk_prot_clear_nulls(sk, prot->obj_size);
1116 		}
1117 	} else
1118 		sk = kmalloc(prot->obj_size, priority);
1119 
1120 	if (sk != NULL) {
1121 		kmemcheck_annotate_bitfield(sk, flags);
1122 
1123 		if (security_sk_alloc(sk, family, priority))
1124 			goto out_free;
1125 
1126 		if (!try_module_get(prot->owner))
1127 			goto out_free_sec;
1128 		sk_tx_queue_clear(sk);
1129 	}
1130 
1131 	return sk;
1132 
1133 out_free_sec:
1134 	security_sk_free(sk);
1135 out_free:
1136 	if (slab != NULL)
1137 		kmem_cache_free(slab, sk);
1138 	else
1139 		kfree(sk);
1140 	return NULL;
1141 }
1142 
1143 static void sk_prot_free(struct proto *prot, struct sock *sk)
1144 {
1145 	struct kmem_cache *slab;
1146 	struct module *owner;
1147 
1148 	owner = prot->owner;
1149 	slab = prot->slab;
1150 
1151 	security_sk_free(sk);
1152 	if (slab != NULL)
1153 		kmem_cache_free(slab, sk);
1154 	else
1155 		kfree(sk);
1156 	module_put(owner);
1157 }
1158 
1159 #ifdef CONFIG_CGROUPS
1160 void sock_update_classid(struct sock *sk)
1161 {
1162 	u32 classid;
1163 
1164 	rcu_read_lock();  /* doing current task, which cannot vanish. */
1165 	classid = task_cls_classid(current);
1166 	rcu_read_unlock();
1167 	if (classid && classid != sk->sk_classid)
1168 		sk->sk_classid = classid;
1169 }
1170 EXPORT_SYMBOL(sock_update_classid);
1171 
1172 void sock_update_netprioidx(struct sock *sk)
1173 {
1174 	struct cgroup_netprio_state *state;
1175 	if (in_interrupt())
1176 		return;
1177 	rcu_read_lock();
1178 	state = task_netprio_state(current);
1179 	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
1180 	rcu_read_unlock();
1181 }
1182 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1183 #endif
1184 
1185 /**
1186  *	sk_alloc - All socket objects are allocated here
1187  *	@net: the applicable net namespace
1188  *	@family: protocol family
1189  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1190  *	@prot: struct proto associated with this new sock instance
1191  */
1192 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1193 		      struct proto *prot)
1194 {
1195 	struct sock *sk;
1196 
1197 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1198 	if (sk) {
1199 		sk->sk_family = family;
1200 		/*
1201 		 * See comment in struct sock definition to understand
1202 		 * why we need sk_prot_creator -acme
1203 		 */
1204 		sk->sk_prot = sk->sk_prot_creator = prot;
1205 		sock_lock_init(sk);
1206 		sock_net_set(sk, get_net(net));
1207 		atomic_set(&sk->sk_wmem_alloc, 1);
1208 
1209 		sock_update_classid(sk);
1210 		sock_update_netprioidx(sk);
1211 	}
1212 
1213 	return sk;
1214 }
1215 EXPORT_SYMBOL(sk_alloc);
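
/* Sketch (hypothetical protocol-family code, not part of this file) of the
 * typical call site: a net_proto_family ->create() handler allocates the
 * sock, attaches it to the struct socket and fills in protocol state.
 * PF_FOO and foo_proto are placeholders.
 *
 *	sk = sk_alloc(net, PF_FOO, GFP_KERNEL, &foo_proto);
 *	if (!sk)
 *		return -ENOMEM;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 */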
1216 
1217 static void __sk_free(struct sock *sk)
1218 {
1219 	struct sk_filter *filter;
1220 
1221 	if (sk->sk_destruct)
1222 		sk->sk_destruct(sk);
1223 
1224 	filter = rcu_dereference_check(sk->sk_filter,
1225 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1226 	if (filter) {
1227 		sk_filter_uncharge(sk, filter);
1228 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1229 	}
1230 
1231 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1232 
1233 	if (atomic_read(&sk->sk_omem_alloc))
1234 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1235 		       __func__, atomic_read(&sk->sk_omem_alloc));
1236 
1237 	if (sk->sk_peer_cred)
1238 		put_cred(sk->sk_peer_cred);
1239 	put_pid(sk->sk_peer_pid);
1240 	put_net(sock_net(sk));
1241 	sk_prot_free(sk->sk_prot_creator, sk);
1242 }
1243 
1244 void sk_free(struct sock *sk)
1245 {
1246 	/*
1247 	 * We subtract one from sk_wmem_alloc so we can tell whether
1248 	 * some packets are still in some tx queue.
1249 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1250 	 */
1251 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1252 		__sk_free(sk);
1253 }
1254 EXPORT_SYMBOL(sk_free);
1255 
1256 /*
1257  * The last sock_put should drop the reference to sk->sk_net. It has already
1258  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1259  * is not an option.
1260  * Take a reference to the socket to remove it from the hash while it is still
1261  * _alive_, and after that destroy it in the context of init_net.
1262  */
1263 void sk_release_kernel(struct sock *sk)
1264 {
1265 	if (sk == NULL || sk->sk_socket == NULL)
1266 		return;
1267 
1268 	sock_hold(sk);
1269 	sock_release(sk->sk_socket);
1270 	release_net(sock_net(sk));
1271 	sock_net_set(sk, get_net(&init_net));
1272 	sock_put(sk);
1273 }
1274 EXPORT_SYMBOL(sk_release_kernel);
1275 
1276 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1277 {
1278 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1279 		sock_update_memcg(newsk);
1280 }
1281 
1282 /**
1283  *	sk_clone_lock - clone a socket, and lock its clone
1284  *	@sk: the socket to clone
1285  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1286  *
1287  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1288  */
1289 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1290 {
1291 	struct sock *newsk;
1292 
1293 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1294 	if (newsk != NULL) {
1295 		struct sk_filter *filter;
1296 
1297 		sock_copy(newsk, sk);
1298 
1299 		/* SANITY */
1300 		get_net(sock_net(newsk));
1301 		sk_node_init(&newsk->sk_node);
1302 		sock_lock_init(newsk);
1303 		bh_lock_sock(newsk);
1304 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1305 		newsk->sk_backlog.len = 0;
1306 
1307 		atomic_set(&newsk->sk_rmem_alloc, 0);
1308 		/*
1309 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1310 		 */
1311 		atomic_set(&newsk->sk_wmem_alloc, 1);
1312 		atomic_set(&newsk->sk_omem_alloc, 0);
1313 		skb_queue_head_init(&newsk->sk_receive_queue);
1314 		skb_queue_head_init(&newsk->sk_write_queue);
1315 #ifdef CONFIG_NET_DMA
1316 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1317 #endif
1318 
1319 		spin_lock_init(&newsk->sk_dst_lock);
1320 		rwlock_init(&newsk->sk_callback_lock);
1321 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1322 				af_callback_keys + newsk->sk_family,
1323 				af_family_clock_key_strings[newsk->sk_family]);
1324 
1325 		newsk->sk_dst_cache	= NULL;
1326 		newsk->sk_wmem_queued	= 0;
1327 		newsk->sk_forward_alloc = 0;
1328 		newsk->sk_send_head	= NULL;
1329 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1330 
1331 		sock_reset_flag(newsk, SOCK_DONE);
1332 		skb_queue_head_init(&newsk->sk_error_queue);
1333 
1334 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1335 		if (filter != NULL)
1336 			sk_filter_charge(newsk, filter);
1337 
1338 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1339 			/* It is still a raw copy of the parent, so invalidate
1340 			 * the destructor and do a plain sk_free() */
1341 			newsk->sk_destruct = NULL;
1342 			bh_unlock_sock(newsk);
1343 			sk_free(newsk);
1344 			newsk = NULL;
1345 			goto out;
1346 		}
1347 
1348 		newsk->sk_err	   = 0;
1349 		newsk->sk_priority = 0;
1350 		/*
1351 		 * Before updating sk_refcnt, we must commit prior changes to memory
1352 		 * (Documentation/RCU/rculist_nulls.txt for details)
1353 		 */
1354 		smp_wmb();
1355 		atomic_set(&newsk->sk_refcnt, 2);
1356 
1357 		/*
1358 		 * Increment the counter in the same struct proto as the master
1359 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1360 		 * is the same as sk->sk_prot->socks, as this field was copied
1361 		 * with memcpy).
1362 		 *
1363 		 * This _changes_ the previous behaviour, where
1364 		 * tcp_create_openreq_child always was incrementing the
1365 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1366 		 * to be taken into account in all callers. -acme
1367 		 */
1368 		sk_refcnt_debug_inc(newsk);
1369 		sk_set_socket(newsk, NULL);
1370 		newsk->sk_wq = NULL;
1371 
1372 		sk_update_clone(sk, newsk);
1373 
1374 		if (newsk->sk_prot->sockets_allocated)
1375 			sk_sockets_allocated_inc(newsk);
1376 
1377 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1378 			net_enable_timestamp();
1379 	}
1380 out:
1381 	return newsk;
1382 }
1383 EXPORT_SYMBOL_GPL(sk_clone_lock);
1384 
1385 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1386 {
1387 	__sk_dst_set(sk, dst);
1388 	sk->sk_route_caps = dst->dev->features;
1389 	if (sk->sk_route_caps & NETIF_F_GSO)
1390 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1391 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1392 	if (sk_can_gso(sk)) {
1393 		if (dst->header_len) {
1394 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1395 		} else {
1396 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1397 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1398 		}
1399 	}
1400 }
1401 EXPORT_SYMBOL_GPL(sk_setup_caps);
1402 
1403 void __init sk_init(void)
1404 {
1405 	if (totalram_pages <= 4096) {
1406 		sysctl_wmem_max = 32767;
1407 		sysctl_rmem_max = 32767;
1408 		sysctl_wmem_default = 32767;
1409 		sysctl_rmem_default = 32767;
1410 	} else if (totalram_pages >= 131072) {
1411 		sysctl_wmem_max = 131071;
1412 		sysctl_rmem_max = 131071;
1413 	}
1414 }
1415 
1416 /*
1417  *	Simple resource managers for sockets.
1418  */
1419 
1420 
1421 /*
1422  * Write buffer destructor automatically called from kfree_skb.
1423  */
1424 void sock_wfree(struct sk_buff *skb)
1425 {
1426 	struct sock *sk = skb->sk;
1427 	unsigned int len = skb->truesize;
1428 
1429 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1430 		/*
1431 		 * Keep a reference on sk_wmem_alloc; it will be released
1432 		 * after the sk_write_space() call
1433 		 */
1434 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1435 		sk->sk_write_space(sk);
1436 		len = 1;
1437 	}
1438 	/*
1439 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1440 	 * could not do because of in-flight packets
1441 	 */
1442 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1443 		__sk_free(sk);
1444 }
1445 EXPORT_SYMBOL(sock_wfree);
1446 
1447 /*
1448  * Read buffer destructor automatically called from kfree_skb.
1449  */
1450 void sock_rfree(struct sk_buff *skb)
1451 {
1452 	struct sock *sk = skb->sk;
1453 	unsigned int len = skb->truesize;
1454 
1455 	atomic_sub(len, &sk->sk_rmem_alloc);
1456 	sk_mem_uncharge(sk, len);
1457 }
1458 EXPORT_SYMBOL(sock_rfree);
1459 
1460 
1461 int sock_i_uid(struct sock *sk)
1462 {
1463 	int uid;
1464 
1465 	read_lock_bh(&sk->sk_callback_lock);
1466 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1467 	read_unlock_bh(&sk->sk_callback_lock);
1468 	return uid;
1469 }
1470 EXPORT_SYMBOL(sock_i_uid);
1471 
1472 unsigned long sock_i_ino(struct sock *sk)
1473 {
1474 	unsigned long ino;
1475 
1476 	read_lock_bh(&sk->sk_callback_lock);
1477 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1478 	read_unlock_bh(&sk->sk_callback_lock);
1479 	return ino;
1480 }
1481 EXPORT_SYMBOL(sock_i_ino);
1482 
1483 /*
1484  * Allocate a skb from the socket's send buffer.
1485  */
1486 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1487 			     gfp_t priority)
1488 {
1489 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1490 		struct sk_buff *skb = alloc_skb(size, priority);
1491 		if (skb) {
1492 			skb_set_owner_w(skb, sk);
1493 			return skb;
1494 		}
1495 	}
1496 	return NULL;
1497 }
1498 EXPORT_SYMBOL(sock_wmalloc);
1499 
1500 /*
1501  * Allocate a skb from the socket's receive buffer.
1502  */
1503 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1504 			     gfp_t priority)
1505 {
1506 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1507 		struct sk_buff *skb = alloc_skb(size, priority);
1508 		if (skb) {
1509 			skb_set_owner_r(skb, sk);
1510 			return skb;
1511 		}
1512 	}
1513 	return NULL;
1514 }
1515 
1516 /*
1517  * Allocate a memory block from the socket's option memory buffer.
1518  */
1519 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1520 {
1521 	if ((unsigned)size <= sysctl_optmem_max &&
1522 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1523 		void *mem;
1524 		/* First do the add, to avoid the race if kmalloc
1525 		 * might sleep.
1526 		 */
1527 		atomic_add(size, &sk->sk_omem_alloc);
1528 		mem = kmalloc(size, priority);
1529 		if (mem)
1530 			return mem;
1531 		atomic_sub(size, &sk->sk_omem_alloc);
1532 	}
1533 	return NULL;
1534 }
1535 EXPORT_SYMBOL(sock_kmalloc);
1536 
1537 /*
1538  * Free an option memory block.
1539  */
1540 void sock_kfree_s(struct sock *sk, void *mem, int size)
1541 {
1542 	kfree(mem);
1543 	atomic_sub(size, &sk->sk_omem_alloc);
1544 }
1545 EXPORT_SYMBOL(sock_kfree_s);
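
/* Minimal sketch (hypothetical option-handling code, not part of this file):
 * allocations from the option memory quota are paired with sock_kfree_s()
 * using the same size, since the sk_omem_alloc quota is charged and
 * uncharged by the caller-supplied length.
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 */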
1546 
1547 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1548    I think these locks should be removed for datagram sockets.
1549  */
1550 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1551 {
1552 	DEFINE_WAIT(wait);
1553 
1554 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1555 	for (;;) {
1556 		if (!timeo)
1557 			break;
1558 		if (signal_pending(current))
1559 			break;
1560 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1561 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1562 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1563 			break;
1564 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1565 			break;
1566 		if (sk->sk_err)
1567 			break;
1568 		timeo = schedule_timeout(timeo);
1569 	}
1570 	finish_wait(sk_sleep(sk), &wait);
1571 	return timeo;
1572 }
1573 
1574 
1575 /*
1576  *	Generic send/receive buffer handlers
1577  */
1578 
1579 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1580 				     unsigned long data_len, int noblock,
1581 				     int *errcode)
1582 {
1583 	struct sk_buff *skb;
1584 	gfp_t gfp_mask;
1585 	long timeo;
1586 	int err;
1587 
1588 	gfp_mask = sk->sk_allocation;
1589 	if (gfp_mask & __GFP_WAIT)
1590 		gfp_mask |= __GFP_REPEAT;
1591 
1592 	timeo = sock_sndtimeo(sk, noblock);
1593 	while (1) {
1594 		err = sock_error(sk);
1595 		if (err != 0)
1596 			goto failure;
1597 
1598 		err = -EPIPE;
1599 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1600 			goto failure;
1601 
1602 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1603 			skb = alloc_skb(header_len, gfp_mask);
1604 			if (skb) {
1605 				int npages;
1606 				int i;
1607 
1608 				/* No pages, we're done... */
1609 				if (!data_len)
1610 					break;
1611 
1612 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1613 				skb->truesize += data_len;
1614 				skb_shinfo(skb)->nr_frags = npages;
1615 				for (i = 0; i < npages; i++) {
1616 					struct page *page;
1617 
1618 					page = alloc_pages(sk->sk_allocation, 0);
1619 					if (!page) {
1620 						err = -ENOBUFS;
1621 						skb_shinfo(skb)->nr_frags = i;
1622 						kfree_skb(skb);
1623 						goto failure;
1624 					}
1625 
1626 					__skb_fill_page_desc(skb, i,
1627 							page, 0,
1628 							(data_len >= PAGE_SIZE ?
1629 							 PAGE_SIZE :
1630 							 data_len));
1631 					data_len -= PAGE_SIZE;
1632 				}
1633 
1634 				/* Full success... */
1635 				break;
1636 			}
1637 			err = -ENOBUFS;
1638 			goto failure;
1639 		}
1640 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1641 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1642 		err = -EAGAIN;
1643 		if (!timeo)
1644 			goto failure;
1645 		if (signal_pending(current))
1646 			goto interrupted;
1647 		timeo = sock_wait_for_wmem(sk, timeo);
1648 	}
1649 
1650 	skb_set_owner_w(skb, sk);
1651 	return skb;
1652 
1653 interrupted:
1654 	err = sock_intr_errno(timeo);
1655 failure:
1656 	*errcode = err;
1657 	return NULL;
1658 }
1659 EXPORT_SYMBOL(sock_alloc_send_pskb);
1660 
1661 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1662 				    int noblock, int *errcode)
1663 {
1664 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1665 }
1666 EXPORT_SYMBOL(sock_alloc_send_skb);
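
/* Sketch (hypothetical datagram sendmsg path, not part of this file) of the
 * common pattern: block, subject to the socket's send timeout, until the
 * write buffer has room, then copy the payload in ("reserve" is a
 * placeholder for protocol header space).
 *
 *	skb = sock_alloc_send_skb(sk, len + reserve,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		goto out;
 *	skb_reserve(skb, reserve);
 *	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
 */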
1667 
1668 static void __lock_sock(struct sock *sk)
1669 	__releases(&sk->sk_lock.slock)
1670 	__acquires(&sk->sk_lock.slock)
1671 {
1672 	DEFINE_WAIT(wait);
1673 
1674 	for (;;) {
1675 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1676 					TASK_UNINTERRUPTIBLE);
1677 		spin_unlock_bh(&sk->sk_lock.slock);
1678 		schedule();
1679 		spin_lock_bh(&sk->sk_lock.slock);
1680 		if (!sock_owned_by_user(sk))
1681 			break;
1682 	}
1683 	finish_wait(&sk->sk_lock.wq, &wait);
1684 }
1685 
1686 static void __release_sock(struct sock *sk)
1687 	__releases(&sk->sk_lock.slock)
1688 	__acquires(&sk->sk_lock.slock)
1689 {
1690 	struct sk_buff *skb = sk->sk_backlog.head;
1691 
1692 	do {
1693 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1694 		bh_unlock_sock(sk);
1695 
1696 		do {
1697 			struct sk_buff *next = skb->next;
1698 
1699 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1700 			skb->next = NULL;
1701 			sk_backlog_rcv(sk, skb);
1702 
1703 			/*
1704 			 * We are in process context here with softirqs
1705 			 * disabled, use cond_resched_softirq() to preempt.
1706 			 * This is safe to do because we've taken the backlog
1707 			 * queue private:
1708 			 */
1709 			cond_resched_softirq();
1710 
1711 			skb = next;
1712 		} while (skb != NULL);
1713 
1714 		bh_lock_sock(sk);
1715 	} while ((skb = sk->sk_backlog.head) != NULL);
1716 
1717 	/*
1718 	 * Doing the zeroing here guarantees that we cannot loop forever
1719 	 * while a wild producer attempts to flood us.
1720 	 */
1721 	sk->sk_backlog.len = 0;
1722 }
1723 
1724 /**
1725  * sk_wait_data - wait for data to arrive at sk_receive_queue
1726  * @sk:    sock to wait on
1727  * @timeo: for how long
1728  *
1729  * Now socket state including sk->sk_err is changed only under the lock,
1730  * hence we may omit checks after joining the wait queue.
1731  * We check the receive queue before schedule() only as an optimization;
1732  * it is very likely that release_sock() added new data.
1733  */
1734 int sk_wait_data(struct sock *sk, long *timeo)
1735 {
1736 	int rc;
1737 	DEFINE_WAIT(wait);
1738 
1739 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1740 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1741 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1742 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1743 	finish_wait(sk_sleep(sk), &wait);
1744 	return rc;
1745 }
1746 EXPORT_SYMBOL(sk_wait_data);
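
/* Sketch (hypothetical recvmsg path, not part of this file): the usual loop
 * is entered with the socket lock held; sk_wait_data() drops it inside
 * sk_wait_event() while sleeping, so release_sock() can move backlogged
 * packets onto the receive queue in the meantime. Error and signal handling
 * are omitted here.
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo);
 *	}
 */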
1747 
1748 /**
1749  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1750  *	@sk: socket
1751  *	@size: memory size to allocate
1752  *	@kind: allocation type
1753  *
1754  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1755  *	rmem allocation. This function assumes that protocols which have
1756  *	memory_pressure use sk_wmem_queued for write buffer accounting.
1757  */
1758 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1759 {
1760 	struct proto *prot = sk->sk_prot;
1761 	int amt = sk_mem_pages(size);
1762 	long allocated;
1763 	int parent_status = UNDER_LIMIT;
1764 
1765 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1766 
1767 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
1768 
1769 	/* Under limit. */
1770 	if (parent_status == UNDER_LIMIT &&
1771 			allocated <= sk_prot_mem_limits(sk, 0)) {
1772 		sk_leave_memory_pressure(sk);
1773 		return 1;
1774 	}
1775 
1776 	/* Under pressure. (we or our parents) */
1777 	if ((parent_status > SOFT_LIMIT) ||
1778 			allocated > sk_prot_mem_limits(sk, 1))
1779 		sk_enter_memory_pressure(sk);
1780 
1781 	/* Over hard limit (we or our parents) */
1782 	if ((parent_status == OVER_LIMIT) ||
1783 			(allocated > sk_prot_mem_limits(sk, 2)))
1784 		goto suppress_allocation;
1785 
1786 	/* guarantee minimum buffer size under pressure */
1787 	if (kind == SK_MEM_RECV) {
1788 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1789 			return 1;
1790 
1791 	} else { /* SK_MEM_SEND */
1792 		if (sk->sk_type == SOCK_STREAM) {
1793 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1794 				return 1;
1795 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1796 			   prot->sysctl_wmem[0])
1797 				return 1;
1798 	}
1799 
1800 	if (sk_has_memory_pressure(sk)) {
1801 		int alloc;
1802 
1803 		if (!sk_under_memory_pressure(sk))
1804 			return 1;
1805 		alloc = sk_sockets_allocated_read_positive(sk);
1806 		if (sk_prot_mem_limits(sk, 2) > alloc *
1807 		    sk_mem_pages(sk->sk_wmem_queued +
1808 				 atomic_read(&sk->sk_rmem_alloc) +
1809 				 sk->sk_forward_alloc))
1810 			return 1;
1811 	}
1812 
1813 suppress_allocation:
1814 
1815 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1816 		sk_stream_moderate_sndbuf(sk);
1817 
1818 		/* Fail only if the socket is _under_ its sndbuf.
1819 		 * In this case we cannot block, so we have to fail.
1820 		 */
1821 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1822 			return 1;
1823 	}
1824 
1825 	trace_sock_exceed_buf_limit(sk, prot, allocated);
1826 
1827 	/* Alas. Undo changes. */
1828 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1829 
1830 	sk_memory_allocated_sub(sk, amt);
1831 
1832 	return 0;
1833 }
1834 EXPORT_SYMBOL(__sk_mem_schedule);
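
/* Protocols normally reach __sk_mem_schedule() through the sk_rmem_schedule()
 * and sk_wmem_schedule() wrappers in include/net/sock.h. A minimal sketch
 * (not part of this file) of charging an incoming skb against the receive
 * quota before queueing it:
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */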
1835 
1836 /**
1837  *	__sk_mem_reclaim - reclaim memory_allocated
1838  *	@sk: socket
1839  */
1840 void __sk_mem_reclaim(struct sock *sk)
1841 {
1842 	sk_memory_allocated_sub(sk,
1843 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
1844 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1845 
1846 	if (sk_under_memory_pressure(sk) &&
1847 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
1848 		sk_leave_memory_pressure(sk);
1849 }
1850 EXPORT_SYMBOL(__sk_mem_reclaim);
1851 
1852 
1853 /*
1854  * Set of default routines for initialising struct proto_ops when
1855  * the protocol does not support a particular function. In certain
1856  * cases where it makes no sense for a protocol to have a "do nothing"
1857  * function, some default processing is provided.
1858  */
1859 
1860 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1861 {
1862 	return -EOPNOTSUPP;
1863 }
1864 EXPORT_SYMBOL(sock_no_bind);
1865 
1866 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1867 		    int len, int flags)
1868 {
1869 	return -EOPNOTSUPP;
1870 }
1871 EXPORT_SYMBOL(sock_no_connect);
1872 
1873 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1874 {
1875 	return -EOPNOTSUPP;
1876 }
1877 EXPORT_SYMBOL(sock_no_socketpair);
1878 
1879 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1880 {
1881 	return -EOPNOTSUPP;
1882 }
1883 EXPORT_SYMBOL(sock_no_accept);
1884 
1885 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1886 		    int *len, int peer)
1887 {
1888 	return -EOPNOTSUPP;
1889 }
1890 EXPORT_SYMBOL(sock_no_getname);
1891 
1892 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1893 {
1894 	return 0;
1895 }
1896 EXPORT_SYMBOL(sock_no_poll);
1897 
1898 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1899 {
1900 	return -EOPNOTSUPP;
1901 }
1902 EXPORT_SYMBOL(sock_no_ioctl);
1903 
1904 int sock_no_listen(struct socket *sock, int backlog)
1905 {
1906 	return -EOPNOTSUPP;
1907 }
1908 EXPORT_SYMBOL(sock_no_listen);
1909 
1910 int sock_no_shutdown(struct socket *sock, int how)
1911 {
1912 	return -EOPNOTSUPP;
1913 }
1914 EXPORT_SYMBOL(sock_no_shutdown);
1915 
1916 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1917 		    char __user *optval, unsigned int optlen)
1918 {
1919 	return -EOPNOTSUPP;
1920 }
1921 EXPORT_SYMBOL(sock_no_setsockopt);
1922 
1923 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1924 		    char __user *optval, int __user *optlen)
1925 {
1926 	return -EOPNOTSUPP;
1927 }
1928 EXPORT_SYMBOL(sock_no_getsockopt);
1929 
1930 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1931 		    size_t len)
1932 {
1933 	return -EOPNOTSUPP;
1934 }
1935 EXPORT_SYMBOL(sock_no_sendmsg);
1936 
1937 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1938 		    size_t len, int flags)
1939 {
1940 	return -EOPNOTSUPP;
1941 }
1942 EXPORT_SYMBOL(sock_no_recvmsg);
1943 
1944 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1945 {
1946 	/* Mirror missing mmap method error code */
1947 	return -ENODEV;
1948 }
1949 EXPORT_SYMBOL(sock_no_mmap);
1950 
1951 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1952 {
1953 	ssize_t res;
1954 	struct msghdr msg = {.msg_flags = flags};
1955 	struct kvec iov;
1956 	char *kaddr = kmap(page);
1957 	iov.iov_base = kaddr + offset;
1958 	iov.iov_len = size;
1959 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1960 	kunmap(page);
1961 	return res;
1962 }
1963 EXPORT_SYMBOL(sock_no_sendpage);
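/*
 * Example of how the sock_no_*() stubs above are typically used: a
 * connection-less protocol points the operations it does not support at
 * these stubs when filling in its struct proto_ops. Sketch only; the
 * PF_EXAMPLE and example_*() names are hypothetical:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,		// hypothetical
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,	// no connections
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= example_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= example_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= example_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */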
1964 
1965 /*
1966  *	Default Socket Callbacks
1967  */
1968 
1969 static void sock_def_wakeup(struct sock *sk)
1970 {
1971 	struct socket_wq *wq;
1972 
1973 	rcu_read_lock();
1974 	wq = rcu_dereference(sk->sk_wq);
1975 	if (wq_has_sleeper(wq))
1976 		wake_up_interruptible_all(&wq->wait);
1977 	rcu_read_unlock();
1978 }
1979 
1980 static void sock_def_error_report(struct sock *sk)
1981 {
1982 	struct socket_wq *wq;
1983 
1984 	rcu_read_lock();
1985 	wq = rcu_dereference(sk->sk_wq);
1986 	if (wq_has_sleeper(wq))
1987 		wake_up_interruptible_poll(&wq->wait, POLLERR);
1988 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1989 	rcu_read_unlock();
1990 }
1991 
1992 static void sock_def_readable(struct sock *sk, int len)
1993 {
1994 	struct socket_wq *wq;
1995 
1996 	rcu_read_lock();
1997 	wq = rcu_dereference(sk->sk_wq);
1998 	if (wq_has_sleeper(wq))
1999 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2000 						POLLRDNORM | POLLRDBAND);
2001 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2002 	rcu_read_unlock();
2003 }
2004 
2005 static void sock_def_write_space(struct sock *sk)
2006 {
2007 	struct socket_wq *wq;
2008 
2009 	rcu_read_lock();
2010 
2011 	/* Do not wake up a writer until it can make "significant"
2012 	 * progress, i.e. at least half the send buffer is free.  --DaveM
2013 	 */
2014 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2015 		wq = rcu_dereference(sk->sk_wq);
2016 		if (wq_has_sleeper(wq))
2017 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2018 						POLLWRNORM | POLLWRBAND);
2019 
2020 		/* Should agree with poll, otherwise some programs break */
2021 		if (sock_writeable(sk))
2022 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2023 	}
2024 
2025 	rcu_read_unlock();
2026 }
2027 
2028 static void sock_def_destruct(struct sock *sk)
2029 {
2030 	kfree(sk->sk_protinfo);
2031 }
2032 
2033 void sk_send_sigurg(struct sock *sk)
2034 {
2035 	if (sk->sk_socket && sk->sk_socket->file)
2036 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2037 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2038 }
2039 EXPORT_SYMBOL(sk_send_sigurg);
2040 
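/*
 * The two timer helpers below keep the socket reference count in step with
 * the timer: arming a timer that was not already pending takes a reference
 * (mod_timer() returns 0 in that case), and cancelling a pending timer
 * drops it again, so a queued timer always pins the socket in memory.
 */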
2041 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2042 		    unsigned long expires)
2043 {
2044 	if (!mod_timer(timer, expires))
2045 		sock_hold(sk);
2046 }
2047 EXPORT_SYMBOL(sk_reset_timer);
2048 
2049 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2050 {
2051 	if (timer_pending(timer) && del_timer(timer))
2052 		__sock_put(sk);
2053 }
2054 EXPORT_SYMBOL(sk_stop_timer);
2055 
2056 void sock_init_data(struct socket *sock, struct sock *sk)
2057 {
2058 	skb_queue_head_init(&sk->sk_receive_queue);
2059 	skb_queue_head_init(&sk->sk_write_queue);
2060 	skb_queue_head_init(&sk->sk_error_queue);
2061 #ifdef CONFIG_NET_DMA
2062 	skb_queue_head_init(&sk->sk_async_wait_queue);
2063 #endif
2064 
2065 	sk->sk_send_head	=	NULL;
2066 
2067 	init_timer(&sk->sk_timer);
2068 
2069 	sk->sk_allocation	=	GFP_KERNEL;
2070 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2071 	sk->sk_sndbuf		=	sysctl_wmem_default;
2072 	sk->sk_state		=	TCP_CLOSE;
2073 	sk_set_socket(sk, sock);
2074 
2075 	sock_set_flag(sk, SOCK_ZAPPED);
2076 
2077 	if (sock) {
2078 		sk->sk_type	=	sock->type;
2079 		sk->sk_wq	=	sock->wq;
2080 		sock->sk	=	sk;
2081 	} else
2082 		sk->sk_wq	=	NULL;
2083 
2084 	spin_lock_init(&sk->sk_dst_lock);
2085 	rwlock_init(&sk->sk_callback_lock);
2086 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2087 			af_callback_keys + sk->sk_family,
2088 			af_family_clock_key_strings[sk->sk_family]);
2089 
2090 	sk->sk_state_change	=	sock_def_wakeup;
2091 	sk->sk_data_ready	=	sock_def_readable;
2092 	sk->sk_write_space	=	sock_def_write_space;
2093 	sk->sk_error_report	=	sock_def_error_report;
2094 	sk->sk_destruct		=	sock_def_destruct;
2095 
2096 	sk->sk_sndmsg_page	=	NULL;
2097 	sk->sk_sndmsg_off	=	0;
2098 
2099 	sk->sk_peer_pid 	=	NULL;
2100 	sk->sk_peer_cred	=	NULL;
2101 	sk->sk_write_pending	=	0;
2102 	sk->sk_rcvlowat		=	1;
2103 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2104 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2105 
2106 	sk->sk_stamp = ktime_set(-1L, 0);
2107 
2108 	/*
2109 	 * Before updating sk_refcnt, we must commit prior changes to memory
2110 	 * (Documentation/RCU/rculist_nulls.txt for details)
2111 	 */
2112 	smp_wmb();
2113 	atomic_set(&sk->sk_refcnt, 1);
2114 	atomic_set(&sk->sk_drops, 0);
2115 }
2116 EXPORT_SYMBOL(sock_init_data);
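/*
 * Typical use of sock_init_data(): a protocol's create routine allocates
 * the sock and lets this helper set up the queues, default callbacks and
 * limits before doing its own initialisation. Sketch only; PF_EXAMPLE,
 * example_proto and example_dgram_ops are hypothetical:
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *
 *		sock->ops = &example_dgram_ops;
 *		sock_init_data(sock, sk);
 *		// protocol-specific fields of sk are set up here
 *		return 0;
 *	}
 */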
2117 
2118 void lock_sock_nested(struct sock *sk, int subclass)
2119 {
2120 	might_sleep();
2121 	spin_lock_bh(&sk->sk_lock.slock);
2122 	if (sk->sk_lock.owned)
2123 		__lock_sock(sk);
2124 	sk->sk_lock.owned = 1;
2125 	spin_unlock(&sk->sk_lock.slock);
2126 	/*
2127 	 * The sk_lock has mutex_lock() semantics here:
2128 	 */
2129 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2130 	local_bh_enable();
2131 }
2132 EXPORT_SYMBOL(lock_sock_nested);
2133 
2134 void release_sock(struct sock *sk)
2135 {
2136 	/*
2137 	 * The sk_lock has mutex_unlock() semantics:
2138 	 */
2139 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2140 
2141 	spin_lock_bh(&sk->sk_lock.slock);
2142 	if (sk->sk_backlog.tail)
2143 		__release_sock(sk);
2144 	sk->sk_lock.owned = 0;
2145 	if (waitqueue_active(&sk->sk_lock.wq))
2146 		wake_up(&sk->sk_lock.wq);
2147 	spin_unlock_bh(&sk->sk_lock.slock);
2148 }
2149 EXPORT_SYMBOL(release_sock);
2150 
2151 /**
2152  * lock_sock_fast - fast version of lock_sock
2153  * @sk: socket
2154  *
2155  * This version should be used for very small sections, where the process won't block.
2156  * Returns false if the fast path is taken:
2157  *   sk_lock.slock locked, owned = 0, BH disabled
2158  * Returns true if the slow path is taken:
2159  *   sk_lock.slock unlocked, owned = 1, BH enabled
2160  */
2161 bool lock_sock_fast(struct sock *sk)
2162 {
2163 	might_sleep();
2164 	spin_lock_bh(&sk->sk_lock.slock);
2165 
2166 	if (!sk->sk_lock.owned)
2167 		/*
2168 		 * Note: the fast path must return with BH still disabled
2169 		 */
2170 		return false;
2171 
2172 	__lock_sock(sk);
2173 	sk->sk_lock.owned = 1;
2174 	spin_unlock(&sk->sk_lock.slock);
2175 	/*
2176 	 * The sk_lock has mutex_lock() semantics here:
2177 	 */
2178 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2179 	local_bh_enable();
2180 	return true;
2181 }
2182 EXPORT_SYMBOL(lock_sock_fast);
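/*
 * Typical pairing with unlock_sock_fast() (an inline helper in
 * include/net/sock.h) around a short, non-blocking critical section:
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// ... quick work on the socket, nothing that can sleep ...
 *
 *	unlock_sock_fast(sk, slow);
 *
 * unlock_sock_fast() either drops the spinlock and re-enables BH (fast
 * path) or performs a full release_sock() (slow path), depending on the
 * value returned here.
 */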
2183 
2184 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2185 {
2186 	struct timeval tv;
2187 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2188 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2189 	tv = ktime_to_timeval(sk->sk_stamp);
2190 	if (tv.tv_sec == -1)
2191 		return -ENOENT;
2192 	if (tv.tv_sec == 0) {
2193 		sk->sk_stamp = ktime_get_real();
2194 		tv = ktime_to_timeval(sk->sk_stamp);
2195 	}
2196 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2197 }
2198 EXPORT_SYMBOL(sock_get_timestamp);
2199 
2200 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2201 {
2202 	struct timespec ts;
2203 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2204 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2205 	ts = ktime_to_timespec(sk->sk_stamp);
2206 	if (ts.tv_sec == -1)
2207 		return -ENOENT;
2208 	if (ts.tv_sec == 0) {
2209 		sk->sk_stamp = ktime_get_real();
2210 		ts = ktime_to_timespec(sk->sk_stamp);
2211 	}
2212 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2213 }
2214 EXPORT_SYMBOL(sock_get_timestampns);
2215 
2216 void sock_enable_timestamp(struct sock *sk, int flag)
2217 {
2218 	if (!sock_flag(sk, flag)) {
2219 		unsigned long previous_flags = sk->sk_flags;
2220 
2221 		sock_set_flag(sk, flag);
2222 		/*
2223 		 * we just set one of the two flags which require net
2224 		 * time stamping, but time stamping might have been on
2225 		 * already because of the other one
2226 		 */
2227 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2228 			net_enable_timestamp();
2229 	}
2230 }
2231 
2232 /*
2233  *	Get a socket option on a socket.
2234  *
2235  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2236  *	asynchronous errors should be reported by getsockopt. We assume
2237  *	this means only when you specify SO_ERROR (otherwise what's the point of it?).
2238  */
2239 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2240 			   char __user *optval, int __user *optlen)
2241 {
2242 	struct sock *sk = sock->sk;
2243 
2244 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2245 }
2246 EXPORT_SYMBOL(sock_common_getsockopt);
2247 
2248 #ifdef CONFIG_COMPAT
2249 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2250 				  char __user *optval, int __user *optlen)
2251 {
2252 	struct sock *sk = sock->sk;
2253 
2254 	if (sk->sk_prot->compat_getsockopt != NULL)
2255 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2256 						      optval, optlen);
2257 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2258 }
2259 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2260 #endif
2261 
2262 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2263 			struct msghdr *msg, size_t size, int flags)
2264 {
2265 	struct sock *sk = sock->sk;
2266 	int addr_len = 0;
2267 	int err;
2268 
2269 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2270 				   flags & ~MSG_DONTWAIT, &addr_len);
2271 	if (err >= 0)
2272 		msg->msg_namelen = addr_len;
2273 	return err;
2274 }
2275 EXPORT_SYMBOL(sock_common_recvmsg);
2276 
2277 /*
2278  *	Set socket options on an inet socket.
2279  */
2280 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2281 			   char __user *optval, unsigned int optlen)
2282 {
2283 	struct sock *sk = sock->sk;
2284 
2285 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2286 }
2287 EXPORT_SYMBOL(sock_common_setsockopt);
2288 
2289 #ifdef CONFIG_COMPAT
2290 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2291 				  char __user *optval, unsigned int optlen)
2292 {
2293 	struct sock *sk = sock->sk;
2294 
2295 	if (sk->sk_prot->compat_setsockopt != NULL)
2296 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2297 						      optval, optlen);
2298 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2299 }
2300 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2301 #endif
2302 
2303 void sk_common_release(struct sock *sk)
2304 {
2305 	if (sk->sk_prot->destroy)
2306 		sk->sk_prot->destroy(sk);
2307 
2308 	/*
2309 	 * Observation: when sk_common_release() is called, processes have
2310 	 * no access to the socket, but the network stack still does.
2311 	 * Step one, detach it from networking:
2312 	 *
2313 	 * A. Remove from hash tables.
2314 	 */
2315 
2316 	sk->sk_prot->unhash(sk);
2317 
2318 	/*
2319 	 * At this point the socket cannot receive new packets, but some may
2320 	 * still be in flight, because some CPU ran the receiver and did its
2321 	 * hash table lookup before we unhashed the socket. Those packets
2322 	 * will reach the receive queue and be purged by the socket destructor.
2323 	 *
2324 	 * Also, we may still have packets pending on the receive queue and,
2325 	 * probably, our own packets waiting in device queues. The destructor
2326 	 * drains the receive queue, but transmitted packets delay socket
2327 	 * destruction until the last reference is released.
2328 	 */
2329 
2330 	sock_orphan(sk);
2331 
2332 	xfrm_sk_free_policy(sk);
2333 
2334 	sk_refcnt_debug_release(sk);
2335 	sock_put(sk);
2336 }
2337 EXPORT_SYMBOL(sk_common_release);
2338 
2339 #ifdef CONFIG_PROC_FS
2340 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2341 struct prot_inuse {
2342 	int val[PROTO_INUSE_NR];
2343 };
2344 
2345 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2346 
2347 #ifdef CONFIG_NET_NS
2348 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2349 {
2350 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2351 }
2352 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2353 
2354 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2355 {
2356 	int cpu, idx = prot->inuse_idx;
2357 	int res = 0;
2358 
2359 	for_each_possible_cpu(cpu)
2360 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2361 
2362 	return res >= 0 ? res : 0;
2363 }
2364 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2365 
2366 static int __net_init sock_inuse_init_net(struct net *net)
2367 {
2368 	net->core.inuse = alloc_percpu(struct prot_inuse);
2369 	return net->core.inuse ? 0 : -ENOMEM;
2370 }
2371 
2372 static void __net_exit sock_inuse_exit_net(struct net *net)
2373 {
2374 	free_percpu(net->core.inuse);
2375 }
2376 
2377 static struct pernet_operations net_inuse_ops = {
2378 	.init = sock_inuse_init_net,
2379 	.exit = sock_inuse_exit_net,
2380 };
2381 
2382 static __init int net_inuse_init(void)
2383 {
2384 	if (register_pernet_subsys(&net_inuse_ops))
2385 		panic("Cannot initialize net inuse counters");
2386 
2387 	return 0;
2388 }
2389 
2390 core_initcall(net_inuse_init);
2391 #else
2392 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2393 
2394 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2395 {
2396 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2397 }
2398 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2399 
2400 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2401 {
2402 	int cpu, idx = prot->inuse_idx;
2403 	int res = 0;
2404 
2405 	for_each_possible_cpu(cpu)
2406 		res += per_cpu(prot_inuse, cpu).val[idx];
2407 
2408 	return res >= 0 ? res : 0;
2409 }
2410 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2411 #endif
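/*
 * Protocols feed these counters from their hash/unhash paths, which is what
 * makes the "sockets" column of /proc/net/protocols meaningful. Sketch of
 * the usual pattern, assuming the caller already holds whatever lock
 * protects its hash table:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// on hash
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// on unhash
 */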
2412 
2413 static void assign_proto_idx(struct proto *prot)
2414 {
2415 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2416 
2417 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2418 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2419 		return;
2420 	}
2421 
2422 	set_bit(prot->inuse_idx, proto_inuse_idx);
2423 }
2424 
2425 static void release_proto_idx(struct proto *prot)
2426 {
2427 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2428 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2429 }
2430 #else
2431 static inline void assign_proto_idx(struct proto *prot)
2432 {
2433 }
2434 
2435 static inline void release_proto_idx(struct proto *prot)
2436 {
2437 }
2438 #endif
2439 
2440 int proto_register(struct proto *prot, int alloc_slab)
2441 {
2442 	if (alloc_slab) {
2443 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2444 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2445 					NULL);
2446 
2447 		if (prot->slab == NULL) {
2448 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2449 			       prot->name);
2450 			goto out;
2451 		}
2452 
2453 		if (prot->rsk_prot != NULL) {
2454 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2455 			if (prot->rsk_prot->slab_name == NULL)
2456 				goto out_free_sock_slab;
2457 
2458 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2459 								 prot->rsk_prot->obj_size, 0,
2460 								 SLAB_HWCACHE_ALIGN, NULL);
2461 
2462 			if (prot->rsk_prot->slab == NULL) {
2463 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2464 				       prot->name);
2465 				goto out_free_request_sock_slab_name;
2466 			}
2467 		}
2468 
2469 		if (prot->twsk_prot != NULL) {
2470 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2471 
2472 			if (prot->twsk_prot->twsk_slab_name == NULL)
2473 				goto out_free_request_sock_slab;
2474 
2475 			prot->twsk_prot->twsk_slab =
2476 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2477 						  prot->twsk_prot->twsk_obj_size,
2478 						  0,
2479 						  SLAB_HWCACHE_ALIGN |
2480 							prot->slab_flags,
2481 						  NULL);
2482 			if (prot->twsk_prot->twsk_slab == NULL)
2483 				goto out_free_timewait_sock_slab_name;
2484 		}
2485 	}
2486 
2487 	mutex_lock(&proto_list_mutex);
2488 	list_add(&prot->node, &proto_list);
2489 	assign_proto_idx(prot);
2490 	mutex_unlock(&proto_list_mutex);
2491 	return 0;
2492 
2493 out_free_timewait_sock_slab_name:
2494 	kfree(prot->twsk_prot->twsk_slab_name);
2495 out_free_request_sock_slab:
2496 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2497 		kmem_cache_destroy(prot->rsk_prot->slab);
2498 		prot->rsk_prot->slab = NULL;
2499 	}
2500 out_free_request_sock_slab_name:
2501 	if (prot->rsk_prot)
2502 		kfree(prot->rsk_prot->slab_name);
2503 out_free_sock_slab:
2504 	kmem_cache_destroy(prot->slab);
2505 	prot->slab = NULL;
2506 out:
2507 	return -ENOBUFS;
2508 }
2509 EXPORT_SYMBOL(proto_register);
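/*
 * Typical registration from a protocol module, with the request_sock and
 * timewait caches left NULL for a protocol that does not need them.
 * Sketch only; example_proto and struct example_sock are hypothetical:
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		int rc = proto_register(&example_proto, 1);
 *
 *		if (rc)
 *			return rc;
 *		// register the address family, notifiers, etc.
 *		return 0;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		// unregister the address family first, then:
 *		proto_unregister(&example_proto);
 *	}
 */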
2510 
2511 void proto_unregister(struct proto *prot)
2512 {
2513 	mutex_lock(&proto_list_mutex);
2514 	release_proto_idx(prot);
2515 	list_del(&prot->node);
2516 	mutex_unlock(&proto_list_mutex);
2517 
2518 	if (prot->slab != NULL) {
2519 		kmem_cache_destroy(prot->slab);
2520 		prot->slab = NULL;
2521 	}
2522 
2523 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2524 		kmem_cache_destroy(prot->rsk_prot->slab);
2525 		kfree(prot->rsk_prot->slab_name);
2526 		prot->rsk_prot->slab = NULL;
2527 	}
2528 
2529 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2530 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2531 		kfree(prot->twsk_prot->twsk_slab_name);
2532 		prot->twsk_prot->twsk_slab = NULL;
2533 	}
2534 }
2535 EXPORT_SYMBOL(proto_unregister);
2536 
2537 #ifdef CONFIG_PROC_FS
2538 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2539 	__acquires(proto_list_mutex)
2540 {
2541 	mutex_lock(&proto_list_mutex);
2542 	return seq_list_start_head(&proto_list, *pos);
2543 }
2544 
2545 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2546 {
2547 	return seq_list_next(v, &proto_list, pos);
2548 }
2549 
2550 static void proto_seq_stop(struct seq_file *seq, void *v)
2551 	__releases(proto_list_mutex)
2552 {
2553 	mutex_unlock(&proto_list_mutex);
2554 }
2555 
2556 static char proto_method_implemented(const void *method)
2557 {
2558 	return method == NULL ? 'n' : 'y';
2559 }
2560 static long sock_prot_memory_allocated(struct proto *proto)
2561 {
2562 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
2563 }
2564 
2565 static char *sock_prot_memory_pressure(struct proto *proto)
2566 {
2567 	return proto->memory_pressure != NULL ?
2568 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2569 }
2570 
2571 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2572 {
2573 
2574 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2575 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2576 		   proto->name,
2577 		   proto->obj_size,
2578 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2579 		   sock_prot_memory_allocated(proto),
2580 		   sock_prot_memory_pressure(proto),
2581 		   proto->max_header,
2582 		   proto->slab == NULL ? "no" : "yes",
2583 		   module_name(proto->owner),
2584 		   proto_method_implemented(proto->close),
2585 		   proto_method_implemented(proto->connect),
2586 		   proto_method_implemented(proto->disconnect),
2587 		   proto_method_implemented(proto->accept),
2588 		   proto_method_implemented(proto->ioctl),
2589 		   proto_method_implemented(proto->init),
2590 		   proto_method_implemented(proto->destroy),
2591 		   proto_method_implemented(proto->shutdown),
2592 		   proto_method_implemented(proto->setsockopt),
2593 		   proto_method_implemented(proto->getsockopt),
2594 		   proto_method_implemented(proto->sendmsg),
2595 		   proto_method_implemented(proto->recvmsg),
2596 		   proto_method_implemented(proto->sendpage),
2597 		   proto_method_implemented(proto->bind),
2598 		   proto_method_implemented(proto->backlog_rcv),
2599 		   proto_method_implemented(proto->hash),
2600 		   proto_method_implemented(proto->unhash),
2601 		   proto_method_implemented(proto->get_port),
2602 		   proto_method_implemented(proto->enter_memory_pressure));
2603 }
2604 
2605 static int proto_seq_show(struct seq_file *seq, void *v)
2606 {
2607 	if (v == &proto_list)
2608 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2609 			   "protocol",
2610 			   "size",
2611 			   "sockets",
2612 			   "memory",
2613 			   "press",
2614 			   "maxhdr",
2615 			   "slab",
2616 			   "module",
2617 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2618 	else
2619 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2620 	return 0;
2621 }
2622 
2623 static const struct seq_operations proto_seq_ops = {
2624 	.start  = proto_seq_start,
2625 	.next   = proto_seq_next,
2626 	.stop   = proto_seq_stop,
2627 	.show   = proto_seq_show,
2628 };
2629 
2630 static int proto_seq_open(struct inode *inode, struct file *file)
2631 {
2632 	return seq_open_net(inode, file, &proto_seq_ops,
2633 			    sizeof(struct seq_net_private));
2634 }
2635 
2636 static const struct file_operations proto_seq_fops = {
2637 	.owner		= THIS_MODULE,
2638 	.open		= proto_seq_open,
2639 	.read		= seq_read,
2640 	.llseek		= seq_lseek,
2641 	.release	= seq_release_net,
2642 };
2643 
2644 static __net_init int proto_init_net(struct net *net)
2645 {
2646 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2647 		return -ENOMEM;
2648 
2649 	return 0;
2650 }
2651 
2652 static __net_exit void proto_exit_net(struct net *net)
2653 {
2654 	proc_net_remove(net, "protocols");
2655 }
2656 
2657 
2658 static __net_initdata struct pernet_operations proto_net_ops = {
2659 	.init = proto_init_net,
2660 	.exit = proto_exit_net,
2661 };
2662 
2663 static int __init proto_init(void)
2664 {
2665 	return register_pernet_subsys(&proto_net_ops);
2666 }
2667 
2668 subsys_initcall(proto_init);
2669 
2670 #endif /* PROC_FS */
2671