/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/jump_label.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(cgrp, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp, ss);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(cgrp, ss);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct jump_label_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
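
/*
 * Worked example (illustrative; the exact numbers are platform dependent):
 * SKB_TRUESIZE(256) is 256 bytes of payload plus the aligned sizes of
 * struct sk_buff and struct skb_shared_info, which on a typical 64-bit
 * build lands in the 512-768 byte range.  The resulting SK_WMEM_MAX and
 * SK_RMEM_MAX defaults are therefore on the order of 150-200KB, not the
 * 64KB that 256 packets * 256 bytes of raw payload alone would suggest.
 */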

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

#if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
			       "tries to set negative timeout\n",
				current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
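
/*
 * Example (illustrative, user-space side): the timeval consumed above is
 * what a caller passes via SO_RCVTIMEO; a 2.5 second receive timeout is
 * converted into jiffies by sock_set_timeout():
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 */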

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm,  current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* We escape from the RCU protected region, so make sure we
	 * don't leak a non-refcounted dst.
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
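
/*
 * Example (illustrative, user-space side): binding a socket to "eth0"
 * through the option handled above; CAP_NET_RAW is required, and an
 * empty name ("") removes the binding:
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0",
 *		       strlen("eth0")) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 */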

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = valbool;
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints. */

		if (val > sysctl_wmem_max)
			val = sysctl_wmem_max;
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		if ((val * 2) < SOCK_MIN_SNDBUF)
			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
		else
			sk->sk_sndbuf = val * 2;

		/*
		 *	Wake up sending tasks if we
		 *	upped the value.
		 */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints. */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc. to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
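
/*
 * Example (illustrative, user-space side): attaching a classic BPF
 * filter through the SO_ATTACH_FILTER case above.  This single-
 * instruction program accepts every packet without truncation:
 *
 *	struct sock_filter insn = BPF_STMT(BPF_RET | BPF_K, 0xFFFFFFFF);
 *	struct sock_fprog prog = { .len = 1, .filter = &insn };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
 *		       &prog, sizeof(prog)) < 0)
 *		perror("setsockopt(SO_ATTACH_FILTER)");
 */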


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = user_ns_map_uid(current_ns, cred, cred->euid);
		ucred->gid = user_ns_map_gid(current_ns, cred, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);
int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = !!sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = !!sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
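
/*
 * Example (illustrative, user-space side): reading SO_RCVBUF back shows
 * the doubling applied in sock_setsockopt(); the kernel deliberately
 * reports the value it actually uses:
 *
 *	int val = 65536;
 *	socklen_t len = sizeof(val);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
 *	(val is now 131072, assuming sysctl_rmem_max permitted 65536)
 */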

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}
/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
void sock_update_classid(struct sock *sk)
{
	u32 classid;

	rcu_read_lock();  /* doing current task, which cannot vanish. */
	classid = task_cls_classid(current);
	rcu_read_unlock();
	if (classid && classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);

void sock_update_netprioidx(struct sock *sk)
{
	struct cgroup_netprio_state *state;
	if (in_interrupt())
		return;
	rcu_read_lock();
	state = task_netprio_state(current);
	sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk);
		sock_update_netprioidx(sk);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
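
/*
 * Typical call sequence (sketch; my_proto is a hypothetical struct proto):
 * a protocol's ->create() handler pairs sk_alloc() with sock_init_data()
 * and drops the sock with sk_free() on failure:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	...
 */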

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = rcu_dereference_check(sk->sk_filter,
				       atomic_read(&sk->sk_wmem_alloc) == 0);
	if (filter) {
		sk_filter_uncharge(sk, filter);
		RCU_INIT_POINTER(sk->sk_filter, NULL);
	}

	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __func__, atomic_read(&sk->sk_omem_alloc));

	if (sk->sk_peer_cred)
		put_cred(sk->sk_peer_cred);
	put_pid(sk->sk_peer_pid);
	put_net(sock_net(sk));
	sk_prot_free(sk->sk_prot_creator, sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can thus tell whether
	 * some packets are still in some tx queue.
	 * If the count is not zero, sock_wfree() will call __sk_free(sk) later.
	 */
	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * The last sock_put should drop a reference to sk->sk_net. It has already
 * been dropped in sk_change_net. Taking a reference to a stopping namespace
 * is not an option.
 * Take a reference to the socket to remove it from the hash _alive_ and
 * after that destroy it in the context of init_net.
 */
void sk_release_kernel(struct sock *sk)
{
	if (sk == NULL || sk->sk_socket == NULL)
		return;

	sock_hold(sk);
	sock_release(sk->sk_socket);
	release_net(sock_net(sk));
	sock_net_set(sk, get_net(&init_net));
	sock_put(sk);
}
EXPORT_SYMBOL(sk_release_kernel);

/**
 *	sk_clone_lock - clone a socket, and lock its clone
 *	@sk: the socket to clone
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *
 *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
 */
struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
	struct sock *newsk;

	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
	if (newsk != NULL) {
		struct sk_filter *filter;

		sock_copy(newsk, sk);

		/* SANITY */
		get_net(sock_net(newsk));
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_backlog.len = 0;

		atomic_set(&newsk->sk_rmem_alloc, 0);
		/*
		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
		 */
		atomic_set(&newsk->sk_wmem_alloc, 1);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);
#ifdef CONFIG_NET_DMA
		skb_queue_head_init(&newsk->sk_async_wait_queue);
#endif

		spin_lock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);
		lockdep_set_class_and_name(&newsk->sk_callback_lock,
				af_callback_keys + newsk->sk_family,
				af_family_clock_key_strings[newsk->sk_family]);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = rcu_dereference_protected(newsk->sk_filter, 1);
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			bh_unlock_sock(newsk);
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		/*
		 * Before updating sk_refcnt, we must commit prior changes to memory
		 * (Documentation/RCU/rculist_nulls.txt for details)
		 */
		smp_wmb();
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		sk_set_socket(newsk, NULL);
		newsk->sk_wq = NULL;

		if (newsk->sk_prot->sockets_allocated)
			sk_sockets_allocated_inc(newsk);

		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
			net_enable_timestamp();
	}
out:
	return newsk;
}
EXPORT_SYMBOL_GPL(sk_clone_lock);

void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	__sk_dst_set(sk, dst);
	sk->sk_route_caps = dst->dev->features;
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	sk->sk_route_caps &= ~sk->sk_route_nocaps;
	if (sk_can_gso(sk)) {
		if (dst->header_len) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = dst->dev->gso_max_size;
		}
	}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

void __init sk_init(void)
{
	if (totalram_pages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (totalram_pages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}
EXPORT_SYMBOL(sock_kmalloc);

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
EXPORT_SYMBOL(sock_kfree_s);
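
/*
 * Typical pairing (sketch; struct my_opts is hypothetical): option data
 * is charged against sk_omem_alloc on allocation and must be released
 * with the matching size:
 *
 *	struct my_opts *opts;
 *
 *	opts = sock_kmalloc(sk, sizeof(*opts), GFP_KERNEL);
 *	if (!opts)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opts, sizeof(*opts));
 */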

/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode)
{
	struct sk_buff *skb;
	gfp_t gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, gfp_mask);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					__skb_fill_page_desc(skb, i,
							page, 0,
							(data_len >= PAGE_SIZE ?
							 PAGE_SIZE :
							 data_len));
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
EXPORT_SYMBOL(sock_alloc_send_skb);

static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb->next = NULL;
			sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while ((skb = sk->sk_backlog.head) != NULL);

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;
	int parent_status = UNDER_LIMIT;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt, &parent_status);

	/* Under limit. */
	if (parent_status == UNDER_LIMIT &&
			allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. (we or our parents) */
	if ((parent_status > SOFT_LIMIT) ||
			allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit (we or our parents) */
	if ((parent_status == OVER_LIMIT) ||
			(allocated > sk_prot_mem_limits(sk, 2)))
		goto suppress_allocation;

	/* guarantee minimum buffer size under pressure */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
				return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt, parent_status);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
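
/*
 * Worked example (illustrative, assuming SK_MEM_QUANTUM == PAGE_SIZE ==
 * 4096): a request for size = 3000 gives amt = sk_mem_pages(3000) = 1,
 * so sk_forward_alloc grows by 4096 and memory_allocated by one quantum.
 * The 1096 bytes left over stay in sk_forward_alloc for later charges,
 * until __sk_mem_reclaim() below hands whole quanta back.
 */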

/**
 *	__sk_mem_reclaim - reclaim memory_allocated
 *	@sk: socket
 */
void __sk_mem_reclaim(struct sock *sk)
{
	sk_memory_allocated_sub(sk,
				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);


/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
1955 
1956 /*
1957  *	Default Socket Callbacks
1958  */
1959 
1960 static void sock_def_wakeup(struct sock *sk)
1961 {
1962 	struct socket_wq *wq;
1963 
1964 	rcu_read_lock();
1965 	wq = rcu_dereference(sk->sk_wq);
1966 	if (wq_has_sleeper(wq))
1967 		wake_up_interruptible_all(&wq->wait);
1968 	rcu_read_unlock();
1969 }
1970 
1971 static void sock_def_error_report(struct sock *sk)
1972 {
1973 	struct socket_wq *wq;
1974 
1975 	rcu_read_lock();
1976 	wq = rcu_dereference(sk->sk_wq);
1977 	if (wq_has_sleeper(wq))
1978 		wake_up_interruptible_poll(&wq->wait, POLLERR);
1979 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1980 	rcu_read_unlock();
1981 }
1982 
1983 static void sock_def_readable(struct sock *sk, int len)
1984 {
1985 	struct socket_wq *wq;
1986 
1987 	rcu_read_lock();
1988 	wq = rcu_dereference(sk->sk_wq);
1989 	if (wq_has_sleeper(wq))
1990 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
1991 						POLLRDNORM | POLLRDBAND);
1992 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1993 	rcu_read_unlock();
1994 }
1995 
1996 static void sock_def_write_space(struct sock *sk)
1997 {
1998 	struct socket_wq *wq;
1999 
2000 	rcu_read_lock();
2001 
2002 	/* Do not wake up a writer until he can make "significant"
2003 	 * progress.  --DaveM
2004 	 */
2005 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2006 		wq = rcu_dereference(sk->sk_wq);
2007 		if (wq_has_sleeper(wq))
2008 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2009 						POLLWRNORM | POLLWRBAND);
2010 
2011 		/* Should agree with poll, otherwise some programs break */
2012 		if (sock_writeable(sk))
2013 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2014 	}
2015 
2016 	rcu_read_unlock();
2017 }
2018 
2019 static void sock_def_destruct(struct sock *sk)
2020 {
2021 	kfree(sk->sk_protinfo);
2022 }
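
/*
 * Protocols may override any of these defaults after sock_init_data()
 * has installed them, e.g. (illustrative; the example_* handlers are
 * hypothetical):
 *
 *	sk->sk_data_ready  = example_data_ready;
 *	sk->sk_write_space = example_write_space;
 */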
2023 
2024 void sk_send_sigurg(struct sock *sk)
2025 {
2026 	if (sk->sk_socket && sk->sk_socket->file)
2027 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2028 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2029 }
2030 EXPORT_SYMBOL(sk_send_sigurg);
2031 
2032 void sk_reset_timer(struct sock *sk, struct timer_list *timer,
2033 		    unsigned long expires)
2034 {
2035 	if (!mod_timer(timer, expires))
2036 		sock_hold(sk);
2037 }
2038 EXPORT_SYMBOL(sk_reset_timer);
2039 
2040 void sk_stop_timer(struct sock *sk, struct timer_list *timer)
2041 {
2042 	if (timer_pending(timer) && del_timer(timer))
2043 		__sock_put(sk);
2044 }
2045 EXPORT_SYMBOL(sk_stop_timer);
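
/*
 * Together these keep a socket reference pinned while a timer is
 * pending: sk_reset_timer() takes a reference when arming an idle timer
 * (mod_timer() returned 0), and sk_stop_timer() drops it when deleting
 * a pending one. The handler must drop the reference itself on expiry.
 * An illustrative sketch (example_timeout is hypothetical):
 *
 *	static void example_timeout(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *
 *		... do the timed work ...
 *		sock_put(sk);
 *	}
 */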
2046 
2047 void sock_init_data(struct socket *sock, struct sock *sk)
2048 {
2049 	skb_queue_head_init(&sk->sk_receive_queue);
2050 	skb_queue_head_init(&sk->sk_write_queue);
2051 	skb_queue_head_init(&sk->sk_error_queue);
2052 #ifdef CONFIG_NET_DMA
2053 	skb_queue_head_init(&sk->sk_async_wait_queue);
2054 #endif
2055 
2056 	sk->sk_send_head	=	NULL;
2057 
2058 	init_timer(&sk->sk_timer);
2059 
2060 	sk->sk_allocation	=	GFP_KERNEL;
2061 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2062 	sk->sk_sndbuf		=	sysctl_wmem_default;
2063 	sk->sk_state		=	TCP_CLOSE;
2064 	sk_set_socket(sk, sock);
2065 
2066 	sock_set_flag(sk, SOCK_ZAPPED);
2067 
2068 	if (sock) {
2069 		sk->sk_type	=	sock->type;
2070 		sk->sk_wq	=	sock->wq;
2071 		sock->sk	=	sk;
2072 	} else
2073 		sk->sk_wq	=	NULL;
2074 
2075 	spin_lock_init(&sk->sk_dst_lock);
2076 	rwlock_init(&sk->sk_callback_lock);
2077 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2078 			af_callback_keys + sk->sk_family,
2079 			af_family_clock_key_strings[sk->sk_family]);
2080 
2081 	sk->sk_state_change	=	sock_def_wakeup;
2082 	sk->sk_data_ready	=	sock_def_readable;
2083 	sk->sk_write_space	=	sock_def_write_space;
2084 	sk->sk_error_report	=	sock_def_error_report;
2085 	sk->sk_destruct		=	sock_def_destruct;
2086 
2087 	sk->sk_sndmsg_page	=	NULL;
2088 	sk->sk_sndmsg_off	=	0;
2089 
2090 	sk->sk_peer_pid 	=	NULL;
2091 	sk->sk_peer_cred	=	NULL;
2092 	sk->sk_write_pending	=	0;
2093 	sk->sk_rcvlowat		=	1;
2094 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2095 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2096 
2097 	sk->sk_stamp = ktime_set(-1L, 0);
2098 
2099 	/*
2100 	 * Before updating sk_refcnt, we must commit prior changes to memory
2101 	 * (see Documentation/RCU/rculist_nulls.txt for details)
2102 	 */
2103 	smp_wmb();
2104 	atomic_set(&sk->sk_refcnt, 1);
2105 	atomic_set(&sk->sk_drops, 0);
2106 }
2107 EXPORT_SYMBOL(sock_init_data);
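
/*
 * sock_init_data() is typically called from an address family's
 * ->create() handler right after sk_alloc(). A hedged sketch, where
 * PF_EXAMPLE, example_proto and example_proto_ops are hypothetical:
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOMEM;
 *		sock->ops = &example_proto_ops;
 *		sock_init_data(sock, sk);
 *		return 0;
 *	}
 */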
2108 
2109 void lock_sock_nested(struct sock *sk, int subclass)
2110 {
2111 	might_sleep();
2112 	spin_lock_bh(&sk->sk_lock.slock);
2113 	if (sk->sk_lock.owned)
2114 		__lock_sock(sk);
2115 	sk->sk_lock.owned = 1;
2116 	spin_unlock(&sk->sk_lock.slock);
2117 	/*
2118 	 * The sk_lock has mutex_lock() semantics here:
2119 	 */
2120 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2121 	local_bh_enable();
2122 }
2123 EXPORT_SYMBOL(lock_sock_nested);
2124 
2125 void release_sock(struct sock *sk)
2126 {
2127 	/*
2128 	 * The sk_lock has mutex_unlock() semantics:
2129 	 */
2130 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2131 
2132 	spin_lock_bh(&sk->sk_lock.slock);
2133 	if (sk->sk_backlog.tail)
2134 		__release_sock(sk);
2135 	sk->sk_lock.owned = 0;
2136 	if (waitqueue_active(&sk->sk_lock.wq))
2137 		wake_up(&sk->sk_lock.wq);
2138 	spin_unlock_bh(&sk->sk_lock.slock);
2139 }
2140 EXPORT_SYMBOL(release_sock);
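
/*
 * Typical process-context usage brackets socket state changes with this
 * pair; packets backlogged while the lock was owned are replayed by
 * release_sock(). Illustrative only:
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */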
2141 
2142 /**
2143  * lock_sock_fast - fast version of lock_sock
2144  * @sk: socket
2145  *
2146  * This version should be used for very small sections where the process
2147  * won't block. Returns false if the fast path is taken:
2148  *   sk_lock.slock locked, owned = 0, BH disabled
2149  * and true if the slow path is taken:
2150  *   sk_lock.slock unlocked, owned = 1, BH enabled
2151  */
2152 bool lock_sock_fast(struct sock *sk)
2153 {
2154 	might_sleep();
2155 	spin_lock_bh(&sk->sk_lock.slock);
2156 
2157 	if (!sk->sk_lock.owned)
2158 		/*
2159 		 * Note: BH stays disabled; unlock_sock_fast() re-enables it
2160 		 */
2161 		return false;
2162 
2163 	__lock_sock(sk);
2164 	sk->sk_lock.owned = 1;
2165 	spin_unlock(&sk->sk_lock.slock);
2166 	/*
2167 	 * The sk_lock has mutex_lock() semantics here:
2168 	 */
2169 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2170 	local_bh_enable();
2171 	return true;
2172 }
2173 EXPORT_SYMBOL(lock_sock_fast);
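
/*
 * The return value must be passed back to unlock_sock_fast(), which
 * does spin_unlock_bh() on the fast path and release_sock() on the slow
 * one. An illustrative sketch:
 *
 *	bool slow = lock_sock_fast(sk);
 *	... short, non-blocking critical section ...
 *	unlock_sock_fast(sk, slow);
 */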
2174 
2175 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2176 {
2177 	struct timeval tv;
2178 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2179 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2180 	tv = ktime_to_timeval(sk->sk_stamp);
2181 	if (tv.tv_sec == -1)
2182 		return -ENOENT;
2183 	if (tv.tv_sec == 0) {
2184 		sk->sk_stamp = ktime_get_real();
2185 		tv = ktime_to_timeval(sk->sk_stamp);
2186 	}
2187 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2188 }
2189 EXPORT_SYMBOL(sock_get_timestamp);
2190 
2191 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2192 {
2193 	struct timespec ts;
2194 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2195 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2196 	ts = ktime_to_timespec(sk->sk_stamp);
2197 	if (ts.tv_sec == -1)
2198 		return -ENOENT;
2199 	if (ts.tv_sec == 0) {
2200 		sk->sk_stamp = ktime_get_real();
2201 		ts = ktime_to_timespec(sk->sk_stamp);
2202 	}
2203 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2204 }
2205 EXPORT_SYMBOL(sock_get_timestampns);
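
/*
 * These two helpers back the SIOCGSTAMP and SIOCGSTAMPNS ioctls;
 * address families dispatch to them from their ->ioctl() handlers,
 * roughly:
 *
 *	case SIOCGSTAMP:
 *		err = sock_get_timestamp(sk, (struct timeval __user *)arg);
 *		break;
 *	case SIOCGSTAMPNS:
 *		err = sock_get_timestampns(sk, (struct timespec __user *)arg);
 *		break;
 */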
2206 
2207 void sock_enable_timestamp(struct sock *sk, int flag)
2208 {
2209 	if (!sock_flag(sk, flag)) {
2210 		unsigned long previous_flags = sk->sk_flags;
2211 
2212 		sock_set_flag(sk, flag);
2213 		/*
2214 		 * We just set one of the two flags that require net
2215 		 * time stamping, but time stamping might already have
2216 		 * been enabled because of the other one.
2217 		 */
2218 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2219 			net_enable_timestamp();
2220 	}
2221 }
2222 
2223 /*
2224  *	Get a socket option on a socket.
2225  *
2226  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2227  *	asynchronous errors should be reported by getsockopt. We assume
2228  *	this means if you specify SO_ERROR (otherwise what's the point of it?).
2229  */
2230 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2231 			   char __user *optval, int __user *optlen)
2232 {
2233 	struct sock *sk = sock->sk;
2234 
2235 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2236 }
2237 EXPORT_SYMBOL(sock_common_getsockopt);
2238 
2239 #ifdef CONFIG_COMPAT
2240 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2241 				  char __user *optval, int __user *optlen)
2242 {
2243 	struct sock *sk = sock->sk;
2244 
2245 	if (sk->sk_prot->compat_getsockopt != NULL)
2246 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2247 						      optval, optlen);
2248 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2249 }
2250 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2251 #endif
2252 
2253 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2254 			struct msghdr *msg, size_t size, int flags)
2255 {
2256 	struct sock *sk = sock->sk;
2257 	int addr_len = 0;
2258 	int err;
2259 
2260 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2261 				   flags & ~MSG_DONTWAIT, &addr_len);
2262 	if (err >= 0)
2263 		msg->msg_namelen = addr_len;
2264 	return err;
2265 }
2266 EXPORT_SYMBOL(sock_common_recvmsg);
2267 
2268 /*
2269  *	Set socket options on a socket.
2270  */
2271 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2272 			   char __user *optval, unsigned int optlen)
2273 {
2274 	struct sock *sk = sock->sk;
2275 
2276 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2277 }
2278 EXPORT_SYMBOL(sock_common_setsockopt);
2279 
2280 #ifdef CONFIG_COMPAT
2281 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2282 				  char __user *optval, unsigned int optlen)
2283 {
2284 	struct sock *sk = sock->sk;
2285 
2286 	if (sk->sk_prot->compat_setsockopt != NULL)
2287 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2288 						      optval, optlen);
2289 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2290 }
2291 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2292 #endif
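
/*
 * Address families usually wire these common helpers straight into
 * their proto_ops, so userspace calls are funnelled through sk_prot.
 * A hedged fragment (field subset only; the compat_* fields exist only
 * under CONFIG_COMPAT):
 *
 *	.setsockopt	   = sock_common_setsockopt,
 *	.getsockopt	   = sock_common_getsockopt,
 *	.recvmsg	   = sock_common_recvmsg,
 *	.compat_setsockopt = compat_sock_common_setsockopt,
 *	.compat_getsockopt = compat_sock_common_getsockopt,
 */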
2293 
2294 void sk_common_release(struct sock *sk)
2295 {
2296 	if (sk->sk_prot->destroy)
2297 		sk->sk_prot->destroy(sk);
2298 
2299 	/*
2300 	 * Observation: when sk_common_release is called, processes have
2301 	 * no access to the socket, but the network stack still does.
2302 	 * Step one, detach it from networking:
2303 	 *
2304 	 * A. Remove from hash tables.
2305 	 */
2306 
2307 	sk->sk_prot->unhash(sk);
2308 
2309 	/*
2310 	 * At this point the socket cannot receive new packets, but some may
2311 	 * be in flight, because some CPU ran the receiver and did the hash
2312 	 * table lookup before we unhashed the socket. They will reach the
2313 	 * receive queue and be purged by the socket destructor.
2314 	 *
2315 	 * Also, we may still have packets pending on the receive queue and,
2316 	 * probably, our own packets waiting in device queues. The destructor
2317 	 * drains the receive queue, but transmitted packets delay socket
2318 	 * destruction until the last reference is released.
2319 	 */
2320 
2321 	sock_orphan(sk);
2322 
2323 	xfrm_sk_free_policy(sk);
2324 
2325 	sk_refcnt_debug_release(sk);
2326 	sock_put(sk);
2327 }
2328 EXPORT_SYMBOL(sk_common_release);
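
/*
 * A protocol's ->close() commonly funnels into sk_common_release() once
 * protocol-private shutdown is done; a sketch modelled on the RAW
 * socket pattern (example_close is hypothetical):
 *
 *	static void example_close(struct sock *sk, long timeout)
 *	{
 *		... protocol-private teardown ...
 *		sk_common_release(sk);
 *	}
 */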
2329 
2330 #ifdef CONFIG_PROC_FS
2331 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2332 struct prot_inuse {
2333 	int val[PROTO_INUSE_NR];
2334 };
2335 
2336 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2337 
2338 #ifdef CONFIG_NET_NS
2339 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2340 {
2341 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2342 }
2343 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2344 
2345 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2346 {
2347 	int cpu, idx = prot->inuse_idx;
2348 	int res = 0;
2349 
2350 	for_each_possible_cpu(cpu)
2351 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2352 
2353 	return res >= 0 ? res : 0;
2354 }
2355 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2356 
2357 static int __net_init sock_inuse_init_net(struct net *net)
2358 {
2359 	net->core.inuse = alloc_percpu(struct prot_inuse);
2360 	return net->core.inuse ? 0 : -ENOMEM;
2361 }
2362 
2363 static void __net_exit sock_inuse_exit_net(struct net *net)
2364 {
2365 	free_percpu(net->core.inuse);
2366 }
2367 
2368 static struct pernet_operations net_inuse_ops = {
2369 	.init = sock_inuse_init_net,
2370 	.exit = sock_inuse_exit_net,
2371 };
2372 
2373 static __init int net_inuse_init(void)
2374 {
2375 	if (register_pernet_subsys(&net_inuse_ops))
2376 		panic("Cannot initialize net inuse counters");
2377 
2378 	return 0;
2379 }
2380 
2381 core_initcall(net_inuse_init);
2382 #else
2383 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2384 
2385 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2386 {
2387 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2388 }
2389 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2390 
2391 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2392 {
2393 	int cpu, idx = prot->inuse_idx;
2394 	int res = 0;
2395 
2396 	for_each_possible_cpu(cpu)
2397 		res += per_cpu(prot_inuse, cpu).val[idx];
2398 
2399 	return res >= 0 ? res : 0;
2400 }
2401 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2402 #endif
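
/*
 * Protocols bump these counters from their ->hash()/->unhash()
 * callbacks, e.g. (illustrative):
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	on hash
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	on unhash
 *
 * Because the per-cpu deltas can be read mid-update, the summed value
 * may transiently be negative, hence the clamp to 0 in
 * sock_prot_inuse_get().
 */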
2403 
2404 static void assign_proto_idx(struct proto *prot)
2405 {
2406 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2407 
2408 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2409 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2410 		return;
2411 	}
2412 
2413 	set_bit(prot->inuse_idx, proto_inuse_idx);
2414 }
2415 
2416 static void release_proto_idx(struct proto *prot)
2417 {
2418 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2419 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2420 }
2421 #else
2422 static inline void assign_proto_idx(struct proto *prot)
2423 {
2424 }
2425 
2426 static inline void release_proto_idx(struct proto *prot)
2427 {
2428 }
2429 #endif
2430 
2431 int proto_register(struct proto *prot, int alloc_slab)
2432 {
2433 	if (alloc_slab) {
2434 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2435 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2436 					NULL);
2437 
2438 		if (prot->slab == NULL) {
2439 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2440 			       prot->name);
2441 			goto out;
2442 		}
2443 
2444 		if (prot->rsk_prot != NULL) {
2445 			prot->rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", prot->name);
2446 			if (prot->rsk_prot->slab_name == NULL)
2447 				goto out_free_sock_slab;
2448 
2449 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2450 								 prot->rsk_prot->obj_size, 0,
2451 								 SLAB_HWCACHE_ALIGN, NULL);
2452 
2453 			if (prot->rsk_prot->slab == NULL) {
2454 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2455 				       prot->name);
2456 				goto out_free_request_sock_slab_name;
2457 			}
2458 		}
2459 
2460 		if (prot->twsk_prot != NULL) {
2461 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2462 
2463 			if (prot->twsk_prot->twsk_slab_name == NULL)
2464 				goto out_free_request_sock_slab;
2465 
2466 			prot->twsk_prot->twsk_slab =
2467 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2468 						  prot->twsk_prot->twsk_obj_size,
2469 						  0,
2470 						  SLAB_HWCACHE_ALIGN |
2471 							prot->slab_flags,
2472 						  NULL);
2473 			if (prot->twsk_prot->twsk_slab == NULL)
2474 				goto out_free_timewait_sock_slab_name;
2475 		}
2476 	}
2477 
2478 	mutex_lock(&proto_list_mutex);
2479 	list_add(&prot->node, &proto_list);
2480 	assign_proto_idx(prot);
2481 	mutex_unlock(&proto_list_mutex);
2482 	return 0;
2483 
2484 out_free_timewait_sock_slab_name:
2485 	kfree(prot->twsk_prot->twsk_slab_name);
2486 out_free_request_sock_slab:
2487 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2488 		kmem_cache_destroy(prot->rsk_prot->slab);
2489 		prot->rsk_prot->slab = NULL;
2490 	}
2491 out_free_request_sock_slab_name:
2492 	if (prot->rsk_prot)
2493 		kfree(prot->rsk_prot->slab_name);
2494 out_free_sock_slab:
2495 	kmem_cache_destroy(prot->slab);
2496 	prot->slab = NULL;
2497 out:
2498 	return -ENOBUFS;
2499 }
2500 EXPORT_SYMBOL(proto_register);
2501 
2502 void proto_unregister(struct proto *prot)
2503 {
2504 	mutex_lock(&proto_list_mutex);
2505 	release_proto_idx(prot);
2506 	list_del(&prot->node);
2507 	mutex_unlock(&proto_list_mutex);
2508 
2509 	if (prot->slab != NULL) {
2510 		kmem_cache_destroy(prot->slab);
2511 		prot->slab = NULL;
2512 	}
2513 
2514 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2515 		kmem_cache_destroy(prot->rsk_prot->slab);
2516 		kfree(prot->rsk_prot->slab_name);
2517 		prot->rsk_prot->slab = NULL;
2518 	}
2519 
2520 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2521 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2522 		kfree(prot->twsk_prot->twsk_slab_name);
2523 		prot->twsk_prot->twsk_slab = NULL;
2524 	}
2525 }
2526 EXPORT_SYMBOL(proto_unregister);
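
/*
 * A protocol module typically pairs these two in its init/exit hooks.
 * A hedged sketch; example_proto and struct example_sock are
 * hypothetical:
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 */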
2527 
2528 #ifdef CONFIG_PROC_FS
2529 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2530 	__acquires(proto_list_mutex)
2531 {
2532 	mutex_lock(&proto_list_mutex);
2533 	return seq_list_start_head(&proto_list, *pos);
2534 }
2535 
2536 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2537 {
2538 	return seq_list_next(v, &proto_list, pos);
2539 }
2540 
2541 static void proto_seq_stop(struct seq_file *seq, void *v)
2542 	__releases(proto_list_mutex)
2543 {
2544 	mutex_unlock(&proto_list_mutex);
2545 }
2546 
2547 static char proto_method_implemented(const void *method)
2548 {
2549 	return method == NULL ? 'n' : 'y';
2550 }
2551 static long sock_prot_memory_allocated(struct proto *proto)
2552 {
2553 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2554 }
2555 
2556 static char *sock_prot_memory_pressure(struct proto *proto)
2557 {
2558 	return proto->memory_pressure != NULL ?
2559 	       proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2560 }
2561 
2562 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2563 {
2564 
2565 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2566 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2567 		   proto->name,
2568 		   proto->obj_size,
2569 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2570 		   sock_prot_memory_allocated(proto),
2571 		   sock_prot_memory_pressure(proto),
2572 		   proto->max_header,
2573 		   proto->slab == NULL ? "no" : "yes",
2574 		   module_name(proto->owner),
2575 		   proto_method_implemented(proto->close),
2576 		   proto_method_implemented(proto->connect),
2577 		   proto_method_implemented(proto->disconnect),
2578 		   proto_method_implemented(proto->accept),
2579 		   proto_method_implemented(proto->ioctl),
2580 		   proto_method_implemented(proto->init),
2581 		   proto_method_implemented(proto->destroy),
2582 		   proto_method_implemented(proto->shutdown),
2583 		   proto_method_implemented(proto->setsockopt),
2584 		   proto_method_implemented(proto->getsockopt),
2585 		   proto_method_implemented(proto->sendmsg),
2586 		   proto_method_implemented(proto->recvmsg),
2587 		   proto_method_implemented(proto->sendpage),
2588 		   proto_method_implemented(proto->bind),
2589 		   proto_method_implemented(proto->backlog_rcv),
2590 		   proto_method_implemented(proto->hash),
2591 		   proto_method_implemented(proto->unhash),
2592 		   proto_method_implemented(proto->get_port),
2593 		   proto_method_implemented(proto->enter_memory_pressure));
2594 }
2595 
2596 static int proto_seq_show(struct seq_file *seq, void *v)
2597 {
2598 	if (v == &proto_list)
2599 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2600 			   "protocol",
2601 			   "size",
2602 			   "sockets",
2603 			   "memory",
2604 			   "press",
2605 			   "maxhdr",
2606 			   "slab",
2607 			   "module",
2608 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2609 	else
2610 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2611 	return 0;
2612 }
2613 
2614 static const struct seq_operations proto_seq_ops = {
2615 	.start  = proto_seq_start,
2616 	.next   = proto_seq_next,
2617 	.stop   = proto_seq_stop,
2618 	.show   = proto_seq_show,
2619 };
2620 
2621 static int proto_seq_open(struct inode *inode, struct file *file)
2622 {
2623 	return seq_open_net(inode, file, &proto_seq_ops,
2624 			    sizeof(struct seq_net_private));
2625 }
2626 
2627 static const struct file_operations proto_seq_fops = {
2628 	.owner		= THIS_MODULE,
2629 	.open		= proto_seq_open,
2630 	.read		= seq_read,
2631 	.llseek		= seq_lseek,
2632 	.release	= seq_release_net,
2633 };
2634 
2635 static __net_init int proto_init_net(struct net *net)
2636 {
2637 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2638 		return -ENOMEM;
2639 
2640 	return 0;
2641 }
2642 
2643 static __net_exit void proto_exit_net(struct net *net)
2644 {
2645 	proc_net_remove(net, "protocols");
2646 }
2647 
2648 
2649 static __net_initdata struct pernet_operations proto_net_ops = {
2650 	.init = proto_init_net,
2651 	.exit = proto_exit_net,
2652 };
2653 
2654 static int __init proto_init(void)
2655 {
2656 	return register_pernet_subsys(&proto_net_ops);
2657 }
2658 
2659 subsys_initcall(proto_init);
2660 
2661 #endif /* PROC_FS */
2662