xref: /linux/net/core/sock.c (revision 93d546399c2b7d66a54d5fbd5eee17de19246bf6)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116 
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <net/xfrm.h>
124 #include <linux/ipsec.h>
125 
126 #include <linux/filter.h>
127 
128 #ifdef CONFIG_INET
129 #include <net/tcp.h>
130 #endif
131 
132 /*
133  * Each address family might have different locking rules, so we have
134  * one slock key per address family:
135  */
136 static struct lock_class_key af_family_keys[AF_MAX];
137 static struct lock_class_key af_family_slock_keys[AF_MAX];
138 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * The three tables below are parallel: entry N of each table names the
 * lock class used for address family N, and the last entry stands for
 * AF_MAX itself.  Keep the family names identical across the tables.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_MAX"
};
static const char *af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_MAX"
};
189 
190 /*
191  * sk_callback_lock locking rules are per-address-family,
192  * so split the lock classes by using a per-AF key:
193  */
194 static struct lock_class_key af_callback_keys[AF_MAX];
195 
196 /* Take into consideration the size of the struct sk_buff overhead in the
197  * determination of these values, since that is non-constant across
198  * platforms.  This makes socket queueing behavior and performance
199  * not depend upon such differences.
200  */
201 #define _SK_MEM_PACKETS		256
202 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
203 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205 
206 /* Run time adjustable parameters. */
207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
211 
/* Maximal space eaten by iovec or ancillary data plus some space */
213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
214 
215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
216 {
217 	struct timeval tv;
218 
219 	if (optlen < sizeof(tv))
220 		return -EINVAL;
221 	if (copy_from_user(&tv, optval, sizeof(tv)))
222 		return -EFAULT;
223 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
224 		return -EDOM;
225 
226 	if (tv.tv_sec < 0) {
227 		static int warned __read_mostly;
228 
229 		*timeo_p = 0;
230 		if (warned < 10 && net_ratelimit()) {
231 			warned++;
232 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
233 			       "tries to set negative timeout\n",
234 				current->comm, task_pid_nr(current));
235 		}
236 		return 0;
237 	}
238 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
239 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
240 		return 0;
241 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243 	return 0;
244 }
245 
246 static void sock_warn_obsolete_bsdism(const char *name)
247 {
248 	static int warned;
249 	static char warncomm[TASK_COMM_LEN];
250 	if (strcmp(warncomm, current->comm) && warned < 5) {
251 		strcpy(warncomm,  current->comm);
252 		printk(KERN_WARNING "process `%s' is using obsolete "
253 		       "%s SO_BSDCOMPAT\n", warncomm, name);
254 		warned++;
255 	}
256 }
257 
258 static void sock_disable_timestamp(struct sock *sk)
259 {
260 	if (sock_flag(sk, SOCK_TIMESTAMP)) {
261 		sock_reset_flag(sk, SOCK_TIMESTAMP);
262 		net_disable_timestamp();
263 	}
264 }
265 
266 
267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 {
269 	int err = 0;
270 	int skb_len;
271 
272 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
273 	   number of warnings when compiling with -W --ANK
274 	 */
275 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276 	    (unsigned)sk->sk_rcvbuf) {
277 		err = -ENOMEM;
278 		goto out;
279 	}
280 
281 	err = sk_filter(sk, skb);
282 	if (err)
283 		goto out;
284 
285 	if (!sk_rmem_schedule(sk, skb->truesize)) {
286 		err = -ENOBUFS;
287 		goto out;
288 	}
289 
290 	skb->dev = NULL;
291 	skb_set_owner_r(skb, sk);
292 
293 	/* Cache the SKB length before we tack it onto the receive
294 	 * queue.  Once it is added it no longer belongs to us and
295 	 * may be freed by other threads of control pulling packets
296 	 * from the queue.
297 	 */
298 	skb_len = skb->len;
299 
300 	skb_queue_tail(&sk->sk_receive_queue, skb);
301 
302 	if (!sock_flag(sk, SOCK_DEAD))
303 		sk->sk_data_ready(sk, skb_len);
304 out:
305 	return err;
306 }
307 EXPORT_SYMBOL(sock_queue_rcv_skb);
308 
309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
310 {
311 	int rc = NET_RX_SUCCESS;
312 
313 	if (sk_filter(sk, skb))
314 		goto discard_and_relse;
315 
316 	skb->dev = NULL;
317 
318 	if (nested)
319 		bh_lock_sock_nested(sk);
320 	else
321 		bh_lock_sock(sk);
322 	if (!sock_owned_by_user(sk)) {
323 		/*
324 		 * trylock + unlock semantics:
325 		 */
326 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
327 
328 		rc = sk_backlog_rcv(sk, skb);
329 
330 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
331 	} else
332 		sk_add_backlog(sk, skb);
333 	bh_unlock_sock(sk);
334 out:
335 	sock_put(sk);
336 	return rc;
337 discard_and_relse:
338 	kfree_skb(skb);
339 	goto out;
340 }
341 EXPORT_SYMBOL(sk_receive_skb);
342 
343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
344 {
345 	struct dst_entry *dst = sk->sk_dst_cache;
346 
347 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
348 		sk->sk_dst_cache = NULL;
349 		dst_release(dst);
350 		return NULL;
351 	}
352 
353 	return dst;
354 }
355 EXPORT_SYMBOL(__sk_dst_check);
356 
357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
358 {
359 	struct dst_entry *dst = sk_dst_get(sk);
360 
361 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
362 		sk_dst_reset(sk);
363 		dst_release(dst);
364 		return NULL;
365 	}
366 
367 	return dst;
368 }
369 EXPORT_SYMBOL(sk_dst_check);
370 
371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
372 {
373 	int ret = -ENOPROTOOPT;
374 #ifdef CONFIG_NETDEVICES
375 	struct net *net = sock_net(sk);
376 	char devname[IFNAMSIZ];
377 	int index;
378 
379 	/* Sorry... */
380 	ret = -EPERM;
381 	if (!capable(CAP_NET_RAW))
382 		goto out;
383 
384 	ret = -EINVAL;
385 	if (optlen < 0)
386 		goto out;
387 
388 	/* Bind this socket to a particular device like "eth0",
389 	 * as specified in the passed interface name. If the
390 	 * name is "" or the option length is zero the socket
391 	 * is not bound.
392 	 */
393 	if (optlen > IFNAMSIZ - 1)
394 		optlen = IFNAMSIZ - 1;
395 	memset(devname, 0, sizeof(devname));
396 
397 	ret = -EFAULT;
398 	if (copy_from_user(devname, optval, optlen))
399 		goto out;
400 
401 	if (devname[0] == '\0') {
402 		index = 0;
403 	} else {
404 		struct net_device *dev = dev_get_by_name(net, devname);
405 
406 		ret = -ENODEV;
407 		if (!dev)
408 			goto out;
409 
410 		index = dev->ifindex;
411 		dev_put(dev);
412 	}
413 
414 	lock_sock(sk);
415 	sk->sk_bound_dev_if = index;
416 	sk_dst_reset(sk);
417 	release_sock(sk);
418 
419 	ret = 0;
420 
421 out:
422 #endif
423 
424 	return ret;
425 }
426 
/* Set socket flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}
	sock_set_flag(sk, bit);
}
434 
435 /*
436  *	This is meant for all protocols to use and covers goings on
437  *	at the socket level. Everything here is generic.
438  */
439 
440 int sock_setsockopt(struct socket *sock, int level, int optname,
441 		    char __user *optval, int optlen)
442 {
443 	struct sock *sk=sock->sk;
444 	int val;
445 	int valbool;
446 	struct linger ling;
447 	int ret = 0;
448 
449 	/*
450 	 *	Options without arguments
451 	 */
452 
453 	if (optname == SO_BINDTODEVICE)
454 		return sock_bindtodevice(sk, optval, optlen);
455 
456 	if (optlen < sizeof(int))
457 		return -EINVAL;
458 
459 	if (get_user(val, (int __user *)optval))
460 		return -EFAULT;
461 
462 	valbool = val?1:0;
463 
464 	lock_sock(sk);
465 
466 	switch(optname) {
467 	case SO_DEBUG:
468 		if (val && !capable(CAP_NET_ADMIN)) {
469 			ret = -EACCES;
470 		} else
471 			sock_valbool_flag(sk, SOCK_DBG, valbool);
472 		break;
473 	case SO_REUSEADDR:
474 		sk->sk_reuse = valbool;
475 		break;
476 	case SO_TYPE:
477 	case SO_ERROR:
478 		ret = -ENOPROTOOPT;
479 		break;
480 	case SO_DONTROUTE:
481 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
482 		break;
483 	case SO_BROADCAST:
484 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
485 		break;
486 	case SO_SNDBUF:
487 		/* Don't error on this BSD doesn't and if you think
488 		   about it this is right. Otherwise apps have to
489 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
490 		   are treated in BSD as hints */
491 
492 		if (val > sysctl_wmem_max)
493 			val = sysctl_wmem_max;
494 set_sndbuf:
495 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
496 		if ((val * 2) < SOCK_MIN_SNDBUF)
497 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
498 		else
499 			sk->sk_sndbuf = val * 2;
500 
501 		/*
502 		 *	Wake up sending tasks if we
503 		 *	upped the value.
504 		 */
505 		sk->sk_write_space(sk);
506 		break;
507 
508 	case SO_SNDBUFFORCE:
509 		if (!capable(CAP_NET_ADMIN)) {
510 			ret = -EPERM;
511 			break;
512 		}
513 		goto set_sndbuf;
514 
515 	case SO_RCVBUF:
516 		/* Don't error on this BSD doesn't and if you think
517 		   about it this is right. Otherwise apps have to
518 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
519 		   are treated in BSD as hints */
520 
521 		if (val > sysctl_rmem_max)
522 			val = sysctl_rmem_max;
523 set_rcvbuf:
524 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
525 		/*
526 		 * We double it on the way in to account for
527 		 * "struct sk_buff" etc. overhead.   Applications
528 		 * assume that the SO_RCVBUF setting they make will
529 		 * allow that much actual data to be received on that
530 		 * socket.
531 		 *
532 		 * Applications are unaware that "struct sk_buff" and
533 		 * other overheads allocate from the receive buffer
534 		 * during socket buffer allocation.
535 		 *
536 		 * And after considering the possible alternatives,
537 		 * returning the value we actually used in getsockopt
538 		 * is the most desirable behavior.
539 		 */
540 		if ((val * 2) < SOCK_MIN_RCVBUF)
541 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
542 		else
543 			sk->sk_rcvbuf = val * 2;
544 		break;
545 
546 	case SO_RCVBUFFORCE:
547 		if (!capable(CAP_NET_ADMIN)) {
548 			ret = -EPERM;
549 			break;
550 		}
551 		goto set_rcvbuf;
552 
553 	case SO_KEEPALIVE:
554 #ifdef CONFIG_INET
555 		if (sk->sk_protocol == IPPROTO_TCP)
556 			tcp_set_keepalive(sk, valbool);
557 #endif
558 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
559 		break;
560 
561 	case SO_OOBINLINE:
562 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
563 		break;
564 
565 	case SO_NO_CHECK:
566 		sk->sk_no_check = valbool;
567 		break;
568 
569 	case SO_PRIORITY:
570 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
571 			sk->sk_priority = val;
572 		else
573 			ret = -EPERM;
574 		break;
575 
576 	case SO_LINGER:
577 		if (optlen < sizeof(ling)) {
578 			ret = -EINVAL;	/* 1003.1g */
579 			break;
580 		}
581 		if (copy_from_user(&ling,optval,sizeof(ling))) {
582 			ret = -EFAULT;
583 			break;
584 		}
585 		if (!ling.l_onoff)
586 			sock_reset_flag(sk, SOCK_LINGER);
587 		else {
588 #if (BITS_PER_LONG == 32)
589 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
590 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
591 			else
592 #endif
593 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
594 			sock_set_flag(sk, SOCK_LINGER);
595 		}
596 		break;
597 
598 	case SO_BSDCOMPAT:
599 		sock_warn_obsolete_bsdism("setsockopt");
600 		break;
601 
602 	case SO_PASSCRED:
603 		if (valbool)
604 			set_bit(SOCK_PASSCRED, &sock->flags);
605 		else
606 			clear_bit(SOCK_PASSCRED, &sock->flags);
607 		break;
608 
609 	case SO_TIMESTAMP:
610 	case SO_TIMESTAMPNS:
611 		if (valbool)  {
612 			if (optname == SO_TIMESTAMP)
613 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
614 			else
615 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
616 			sock_set_flag(sk, SOCK_RCVTSTAMP);
617 			sock_enable_timestamp(sk);
618 		} else {
619 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
620 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
621 		}
622 		break;
623 
624 	case SO_RCVLOWAT:
625 		if (val < 0)
626 			val = INT_MAX;
627 		sk->sk_rcvlowat = val ? : 1;
628 		break;
629 
630 	case SO_RCVTIMEO:
631 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
632 		break;
633 
634 	case SO_SNDTIMEO:
635 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
636 		break;
637 
638 	case SO_ATTACH_FILTER:
639 		ret = -EINVAL;
640 		if (optlen == sizeof(struct sock_fprog)) {
641 			struct sock_fprog fprog;
642 
643 			ret = -EFAULT;
644 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
645 				break;
646 
647 			ret = sk_attach_filter(&fprog, sk);
648 		}
649 		break;
650 
651 	case SO_DETACH_FILTER:
652 		ret = sk_detach_filter(sk);
653 		break;
654 
655 	case SO_PASSSEC:
656 		if (valbool)
657 			set_bit(SOCK_PASSSEC, &sock->flags);
658 		else
659 			clear_bit(SOCK_PASSSEC, &sock->flags);
660 		break;
661 	case SO_MARK:
662 		if (!capable(CAP_NET_ADMIN))
663 			ret = -EPERM;
664 		else {
665 			sk->sk_mark = val;
666 		}
667 		break;
668 
669 		/* We implement the SO_SNDLOWAT etc to
670 		   not be settable (1003.1g 5.3) */
671 	default:
672 		ret = -ENOPROTOOPT;
673 		break;
674 	}
675 	release_sock(sk);
676 	return ret;
677 }
678 
679 
680 int sock_getsockopt(struct socket *sock, int level, int optname,
681 		    char __user *optval, int __user *optlen)
682 {
683 	struct sock *sk = sock->sk;
684 
685 	union {
686 		int val;
687 		struct linger ling;
688 		struct timeval tm;
689 	} v;
690 
691 	unsigned int lv = sizeof(int);
692 	int len;
693 
694 	if (get_user(len, optlen))
695 		return -EFAULT;
696 	if (len < 0)
697 		return -EINVAL;
698 
699 	switch(optname) {
700 	case SO_DEBUG:
701 		v.val = sock_flag(sk, SOCK_DBG);
702 		break;
703 
704 	case SO_DONTROUTE:
705 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
706 		break;
707 
708 	case SO_BROADCAST:
709 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
710 		break;
711 
712 	case SO_SNDBUF:
713 		v.val = sk->sk_sndbuf;
714 		break;
715 
716 	case SO_RCVBUF:
717 		v.val = sk->sk_rcvbuf;
718 		break;
719 
720 	case SO_REUSEADDR:
721 		v.val = sk->sk_reuse;
722 		break;
723 
724 	case SO_KEEPALIVE:
725 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
726 		break;
727 
728 	case SO_TYPE:
729 		v.val = sk->sk_type;
730 		break;
731 
732 	case SO_ERROR:
733 		v.val = -sock_error(sk);
734 		if (v.val==0)
735 			v.val = xchg(&sk->sk_err_soft, 0);
736 		break;
737 
738 	case SO_OOBINLINE:
739 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
740 		break;
741 
742 	case SO_NO_CHECK:
743 		v.val = sk->sk_no_check;
744 		break;
745 
746 	case SO_PRIORITY:
747 		v.val = sk->sk_priority;
748 		break;
749 
750 	case SO_LINGER:
751 		lv		= sizeof(v.ling);
752 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
753 		v.ling.l_linger	= sk->sk_lingertime / HZ;
754 		break;
755 
756 	case SO_BSDCOMPAT:
757 		sock_warn_obsolete_bsdism("getsockopt");
758 		break;
759 
760 	case SO_TIMESTAMP:
761 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
762 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
763 		break;
764 
765 	case SO_TIMESTAMPNS:
766 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
767 		break;
768 
769 	case SO_RCVTIMEO:
770 		lv=sizeof(struct timeval);
771 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
772 			v.tm.tv_sec = 0;
773 			v.tm.tv_usec = 0;
774 		} else {
775 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
776 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
777 		}
778 		break;
779 
780 	case SO_SNDTIMEO:
781 		lv=sizeof(struct timeval);
782 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
783 			v.tm.tv_sec = 0;
784 			v.tm.tv_usec = 0;
785 		} else {
786 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
787 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
788 		}
789 		break;
790 
791 	case SO_RCVLOWAT:
792 		v.val = sk->sk_rcvlowat;
793 		break;
794 
795 	case SO_SNDLOWAT:
796 		v.val=1;
797 		break;
798 
799 	case SO_PASSCRED:
800 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
801 		break;
802 
803 	case SO_PEERCRED:
804 		if (len > sizeof(sk->sk_peercred))
805 			len = sizeof(sk->sk_peercred);
806 		if (copy_to_user(optval, &sk->sk_peercred, len))
807 			return -EFAULT;
808 		goto lenout;
809 
810 	case SO_PEERNAME:
811 	{
812 		char address[128];
813 
814 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
815 			return -ENOTCONN;
816 		if (lv < len)
817 			return -EINVAL;
818 		if (copy_to_user(optval, address, len))
819 			return -EFAULT;
820 		goto lenout;
821 	}
822 
823 	/* Dubious BSD thing... Probably nobody even uses it, but
824 	 * the UNIX standard wants it for whatever reason... -DaveM
825 	 */
826 	case SO_ACCEPTCONN:
827 		v.val = sk->sk_state == TCP_LISTEN;
828 		break;
829 
830 	case SO_PASSSEC:
831 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
832 		break;
833 
834 	case SO_PEERSEC:
835 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
836 
837 	case SO_MARK:
838 		v.val = sk->sk_mark;
839 		break;
840 
841 	default:
842 		return -ENOPROTOOPT;
843 	}
844 
845 	if (len > lv)
846 		len = lv;
847 	if (copy_to_user(optval, &v, len))
848 		return -EFAULT;
849 lenout:
850 	if (put_user(len, optlen))
851 		return -EFAULT;
852 	return 0;
853 }
854 
855 /*
856  * Initialize an sk_lock.
857  *
858  * (We also register the sk_lock with the lock validator.)
859  */
860 static inline void sock_lock_init(struct sock *sk)
861 {
862 	sock_lock_init_class_and_name(sk,
863 			af_family_slock_key_strings[sk->sk_family],
864 			af_family_slock_keys + sk->sk_family,
865 			af_family_key_strings[sk->sk_family],
866 			af_family_keys + sk->sk_family);
867 }
868 
869 static void sock_copy(struct sock *nsk, const struct sock *osk)
870 {
871 #ifdef CONFIG_SECURITY_NETWORK
872 	void *sptr = nsk->sk_security;
873 #endif
874 
875 	memcpy(nsk, osk, osk->sk_prot->obj_size);
876 #ifdef CONFIG_SECURITY_NETWORK
877 	nsk->sk_security = sptr;
878 	security_sk_clone(osk, nsk);
879 #endif
880 }
881 
882 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
883 		int family)
884 {
885 	struct sock *sk;
886 	struct kmem_cache *slab;
887 
888 	slab = prot->slab;
889 	if (slab != NULL)
890 		sk = kmem_cache_alloc(slab, priority);
891 	else
892 		sk = kmalloc(prot->obj_size, priority);
893 
894 	if (sk != NULL) {
895 		if (security_sk_alloc(sk, family, priority))
896 			goto out_free;
897 
898 		if (!try_module_get(prot->owner))
899 			goto out_free_sec;
900 	}
901 
902 	return sk;
903 
904 out_free_sec:
905 	security_sk_free(sk);
906 out_free:
907 	if (slab != NULL)
908 		kmem_cache_free(slab, sk);
909 	else
910 		kfree(sk);
911 	return NULL;
912 }
913 
914 static void sk_prot_free(struct proto *prot, struct sock *sk)
915 {
916 	struct kmem_cache *slab;
917 	struct module *owner;
918 
919 	owner = prot->owner;
920 	slab = prot->slab;
921 
922 	security_sk_free(sk);
923 	if (slab != NULL)
924 		kmem_cache_free(slab, sk);
925 	else
926 		kfree(sk);
927 	module_put(owner);
928 }
929 
930 /**
931  *	sk_alloc - All socket objects are allocated here
932  *	@net: the applicable net namespace
933  *	@family: protocol family
934  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
935  *	@prot: struct proto associated with this new sock instance
936  */
937 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
938 		      struct proto *prot)
939 {
940 	struct sock *sk;
941 
942 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
943 	if (sk) {
944 		sk->sk_family = family;
945 		/*
946 		 * See comment in struct sock definition to understand
947 		 * why we need sk_prot_creator -acme
948 		 */
949 		sk->sk_prot = sk->sk_prot_creator = prot;
950 		sock_lock_init(sk);
951 		sock_net_set(sk, get_net(net));
952 	}
953 
954 	return sk;
955 }
956 
957 void sk_free(struct sock *sk)
958 {
959 	struct sk_filter *filter;
960 
961 	if (sk->sk_destruct)
962 		sk->sk_destruct(sk);
963 
964 	filter = rcu_dereference(sk->sk_filter);
965 	if (filter) {
966 		sk_filter_uncharge(sk, filter);
967 		rcu_assign_pointer(sk->sk_filter, NULL);
968 	}
969 
970 	sock_disable_timestamp(sk);
971 
972 	if (atomic_read(&sk->sk_omem_alloc))
973 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
974 		       __func__, atomic_read(&sk->sk_omem_alloc));
975 
976 	put_net(sock_net(sk));
977 	sk_prot_free(sk->sk_prot_creator, sk);
978 }
979 
980 /*
981  * Last sock_put should drop referrence to sk->sk_net. It has already
982  * been dropped in sk_change_net. Taking referrence to stopping namespace
983  * is not an option.
984  * Take referrence to a socket to remove it from hash _alive_ and after that
985  * destroy it in the context of init_net.
986  */
987 void sk_release_kernel(struct sock *sk)
988 {
989 	if (sk == NULL || sk->sk_socket == NULL)
990 		return;
991 
992 	sock_hold(sk);
993 	sock_release(sk->sk_socket);
994 	release_net(sock_net(sk));
995 	sock_net_set(sk, get_net(&init_net));
996 	sock_put(sk);
997 }
998 EXPORT_SYMBOL(sk_release_kernel);
999 
1000 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1001 {
1002 	struct sock *newsk;
1003 
1004 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1005 	if (newsk != NULL) {
1006 		struct sk_filter *filter;
1007 
1008 		sock_copy(newsk, sk);
1009 
1010 		/* SANITY */
1011 		get_net(sock_net(newsk));
1012 		sk_node_init(&newsk->sk_node);
1013 		sock_lock_init(newsk);
1014 		bh_lock_sock(newsk);
1015 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1016 
1017 		atomic_set(&newsk->sk_rmem_alloc, 0);
1018 		atomic_set(&newsk->sk_wmem_alloc, 0);
1019 		atomic_set(&newsk->sk_omem_alloc, 0);
1020 		skb_queue_head_init(&newsk->sk_receive_queue);
1021 		skb_queue_head_init(&newsk->sk_write_queue);
1022 #ifdef CONFIG_NET_DMA
1023 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1024 #endif
1025 
1026 		rwlock_init(&newsk->sk_dst_lock);
1027 		rwlock_init(&newsk->sk_callback_lock);
1028 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1029 				af_callback_keys + newsk->sk_family,
1030 				af_family_clock_key_strings[newsk->sk_family]);
1031 
1032 		newsk->sk_dst_cache	= NULL;
1033 		newsk->sk_wmem_queued	= 0;
1034 		newsk->sk_forward_alloc = 0;
1035 		newsk->sk_send_head	= NULL;
1036 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1037 
1038 		sock_reset_flag(newsk, SOCK_DONE);
1039 		skb_queue_head_init(&newsk->sk_error_queue);
1040 
1041 		filter = newsk->sk_filter;
1042 		if (filter != NULL)
1043 			sk_filter_charge(newsk, filter);
1044 
1045 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1046 			/* It is still raw copy of parent, so invalidate
1047 			 * destructor and make plain sk_free() */
1048 			newsk->sk_destruct = NULL;
1049 			sk_free(newsk);
1050 			newsk = NULL;
1051 			goto out;
1052 		}
1053 
1054 		newsk->sk_err	   = 0;
1055 		newsk->sk_priority = 0;
1056 		atomic_set(&newsk->sk_refcnt, 2);
1057 
1058 		/*
1059 		 * Increment the counter in the same struct proto as the master
1060 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1061 		 * is the same as sk->sk_prot->socks, as this field was copied
1062 		 * with memcpy).
1063 		 *
1064 		 * This _changes_ the previous behaviour, where
1065 		 * tcp_create_openreq_child always was incrementing the
1066 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1067 		 * to be taken into account in all callers. -acme
1068 		 */
1069 		sk_refcnt_debug_inc(newsk);
1070 		sk_set_socket(newsk, NULL);
1071 		newsk->sk_sleep	 = NULL;
1072 
1073 		if (newsk->sk_prot->sockets_allocated)
1074 			atomic_inc(newsk->sk_prot->sockets_allocated);
1075 	}
1076 out:
1077 	return newsk;
1078 }
1079 
1080 EXPORT_SYMBOL_GPL(sk_clone);
1081 
1082 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1083 {
1084 	__sk_dst_set(sk, dst);
1085 	sk->sk_route_caps = dst->dev->features;
1086 	if (sk->sk_route_caps & NETIF_F_GSO)
1087 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1088 	if (sk_can_gso(sk)) {
1089 		if (dst->header_len) {
1090 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1091 		} else {
1092 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1093 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1094 		}
1095 	}
1096 }
1097 EXPORT_SYMBOL_GPL(sk_setup_caps);
1098 
1099 void __init sk_init(void)
1100 {
1101 	if (num_physpages <= 4096) {
1102 		sysctl_wmem_max = 32767;
1103 		sysctl_rmem_max = 32767;
1104 		sysctl_wmem_default = 32767;
1105 		sysctl_rmem_default = 32767;
1106 	} else if (num_physpages >= 131072) {
1107 		sysctl_wmem_max = 131071;
1108 		sysctl_rmem_max = 131071;
1109 	}
1110 }
1111 
1112 /*
1113  *	Simple resource managers for sockets.
1114  */
1115 
1116 
1117 /*
1118  * Write buffer destructor automatically called from kfree_skb.
1119  */
1120 void sock_wfree(struct sk_buff *skb)
1121 {
1122 	struct sock *sk = skb->sk;
1123 
1124 	/* In case it might be waiting for more memory. */
1125 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1126 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1127 		sk->sk_write_space(sk);
1128 	sock_put(sk);
1129 }
1130 
1131 /*
1132  * Read buffer destructor automatically called from kfree_skb.
1133  */
1134 void sock_rfree(struct sk_buff *skb)
1135 {
1136 	struct sock *sk = skb->sk;
1137 
1138 	skb_truesize_check(skb);
1139 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1140 	sk_mem_uncharge(skb->sk, skb->truesize);
1141 }
1142 
1143 
1144 int sock_i_uid(struct sock *sk)
1145 {
1146 	int uid;
1147 
1148 	read_lock(&sk->sk_callback_lock);
1149 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1150 	read_unlock(&sk->sk_callback_lock);
1151 	return uid;
1152 }
1153 
1154 unsigned long sock_i_ino(struct sock *sk)
1155 {
1156 	unsigned long ino;
1157 
1158 	read_lock(&sk->sk_callback_lock);
1159 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1160 	read_unlock(&sk->sk_callback_lock);
1161 	return ino;
1162 }
1163 
1164 /*
1165  * Allocate a skb from the socket's send buffer.
1166  */
1167 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1168 			     gfp_t priority)
1169 {
1170 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1171 		struct sk_buff * skb = alloc_skb(size, priority);
1172 		if (skb) {
1173 			skb_set_owner_w(skb, sk);
1174 			return skb;
1175 		}
1176 	}
1177 	return NULL;
1178 }
1179 
1180 /*
1181  * Allocate a skb from the socket's receive buffer.
1182  */
1183 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1184 			     gfp_t priority)
1185 {
1186 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1187 		struct sk_buff *skb = alloc_skb(size, priority);
1188 		if (skb) {
1189 			skb_set_owner_r(skb, sk);
1190 			return skb;
1191 		}
1192 	}
1193 	return NULL;
1194 }
1195 
1196 /*
1197  * Allocate a memory block from the socket's option memory buffer.
1198  */
1199 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1200 {
1201 	if ((unsigned)size <= sysctl_optmem_max &&
1202 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1203 		void *mem;
1204 		/* First do the add, to avoid the race if kmalloc
1205 		 * might sleep.
1206 		 */
1207 		atomic_add(size, &sk->sk_omem_alloc);
1208 		mem = kmalloc(size, priority);
1209 		if (mem)
1210 			return mem;
1211 		atomic_sub(size, &sk->sk_omem_alloc);
1212 	}
1213 	return NULL;
1214 }
1215 
1216 /*
1217  * Free an option memory block.
1218  */
1219 void sock_kfree_s(struct sock *sk, void *mem, int size)
1220 {
1221 	kfree(mem);
1222 	atomic_sub(size, &sk->sk_omem_alloc);
1223 }
1224 
1225 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1226    I think, these locks should be removed for datagram sockets.
1227  */
1228 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1229 {
1230 	DEFINE_WAIT(wait);
1231 
1232 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233 	for (;;) {
1234 		if (!timeo)
1235 			break;
1236 		if (signal_pending(current))
1237 			break;
1238 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1239 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1240 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1241 			break;
1242 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1243 			break;
1244 		if (sk->sk_err)
1245 			break;
1246 		timeo = schedule_timeout(timeo);
1247 	}
1248 	finish_wait(sk->sk_sleep, &wait);
1249 	return timeo;
1250 }
1251 
1252 
1253 /*
1254  *	Generic send/receive buffer handlers
1255  */
1256 
1257 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1258 					    unsigned long header_len,
1259 					    unsigned long data_len,
1260 					    int noblock, int *errcode)
1261 {
1262 	struct sk_buff *skb;
1263 	gfp_t gfp_mask;
1264 	long timeo;
1265 	int err;
1266 
1267 	gfp_mask = sk->sk_allocation;
1268 	if (gfp_mask & __GFP_WAIT)
1269 		gfp_mask |= __GFP_REPEAT;
1270 
1271 	timeo = sock_sndtimeo(sk, noblock);
1272 	while (1) {
1273 		err = sock_error(sk);
1274 		if (err != 0)
1275 			goto failure;
1276 
1277 		err = -EPIPE;
1278 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1279 			goto failure;
1280 
1281 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282 			skb = alloc_skb(header_len, gfp_mask);
1283 			if (skb) {
1284 				int npages;
1285 				int i;
1286 
1287 				/* No pages, we're done... */
1288 				if (!data_len)
1289 					break;
1290 
1291 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1292 				skb->truesize += data_len;
1293 				skb_shinfo(skb)->nr_frags = npages;
1294 				for (i = 0; i < npages; i++) {
1295 					struct page *page;
1296 					skb_frag_t *frag;
1297 
1298 					page = alloc_pages(sk->sk_allocation, 0);
1299 					if (!page) {
1300 						err = -ENOBUFS;
1301 						skb_shinfo(skb)->nr_frags = i;
1302 						kfree_skb(skb);
1303 						goto failure;
1304 					}
1305 
1306 					frag = &skb_shinfo(skb)->frags[i];
1307 					frag->page = page;
1308 					frag->page_offset = 0;
1309 					frag->size = (data_len >= PAGE_SIZE ?
1310 						      PAGE_SIZE :
1311 						      data_len);
1312 					data_len -= PAGE_SIZE;
1313 				}
1314 
1315 				/* Full success... */
1316 				break;
1317 			}
1318 			err = -ENOBUFS;
1319 			goto failure;
1320 		}
1321 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1322 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1323 		err = -EAGAIN;
1324 		if (!timeo)
1325 			goto failure;
1326 		if (signal_pending(current))
1327 			goto interrupted;
1328 		timeo = sock_wait_for_wmem(sk, timeo);
1329 	}
1330 
1331 	skb_set_owner_w(skb, sk);
1332 	return skb;
1333 
1334 interrupted:
1335 	err = sock_intr_errno(timeo);
1336 failure:
1337 	*errcode = err;
1338 	return NULL;
1339 }
1340 
/*
 * Allocate a send skb of @size linear bytes with no paged data; a thin
 * wrapper around sock_alloc_send_pskb().
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1346 
1347 static void __lock_sock(struct sock *sk)
1348 {
1349 	DEFINE_WAIT(wait);
1350 
1351 	for (;;) {
1352 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1353 					TASK_UNINTERRUPTIBLE);
1354 		spin_unlock_bh(&sk->sk_lock.slock);
1355 		schedule();
1356 		spin_lock_bh(&sk->sk_lock.slock);
1357 		if (!sock_owned_by_user(sk))
1358 			break;
1359 	}
1360 	finish_wait(&sk->sk_lock.wq, &wait);
1361 }
1362 
1363 static void __release_sock(struct sock *sk)
1364 {
1365 	struct sk_buff *skb = sk->sk_backlog.head;
1366 
1367 	do {
1368 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1369 		bh_unlock_sock(sk);
1370 
1371 		do {
1372 			struct sk_buff *next = skb->next;
1373 
1374 			skb->next = NULL;
1375 			sk_backlog_rcv(sk, skb);
1376 
1377 			/*
1378 			 * We are in process context here with softirqs
1379 			 * disabled, use cond_resched_softirq() to preempt.
1380 			 * This is safe to do because we've taken the backlog
1381 			 * queue private:
1382 			 */
1383 			cond_resched_softirq();
1384 
1385 			skb = next;
1386 		} while (skb != NULL);
1387 
1388 		bh_lock_sock(sk);
1389 	} while ((skb = sk->sk_backlog.head) != NULL);
1390 }
1391 
1392 /**
1393  * sk_wait_data - wait for data to arrive at sk_receive_queue
1394  * @sk:    sock to wait on
1395  * @timeo: for how long
1396  *
1397  * Now socket state including sk->sk_err is changed only under lock,
1398  * hence we may omit checks after joining wait queue.
1399  * We check receive queue before schedule() only as optimization;
1400  * it is very likely that release_sock() added new data.
1401  */
1402 int sk_wait_data(struct sock *sk, long *timeo)
1403 {
1404 	int rc;
1405 	DEFINE_WAIT(wait);
1406 
1407 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1408 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1409 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1410 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1411 	finish_wait(sk->sk_sleep, &wait);
1412 	return rc;
1413 }
1414 
1415 EXPORT_SYMBOL(sk_wait_data);
1416 
1417 /**
1418  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1419  *	@sk: socket
1420  *	@size: memory size to allocate
1421  *	@kind: allocation type
1422  *
1423  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1424  *	rmem allocation. This function assumes that protocols which have
1425  *	memory_pressure use sk_wmem_queued as write buffer accounting.
1426  */
1427 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1428 {
1429 	struct proto *prot = sk->sk_prot;
1430 	int amt = sk_mem_pages(size);
1431 	int allocated;
1432 
1433 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1434 	allocated = atomic_add_return(amt, prot->memory_allocated);
1435 
1436 	/* Under limit. */
1437 	if (allocated <= prot->sysctl_mem[0]) {
1438 		if (prot->memory_pressure && *prot->memory_pressure)
1439 			*prot->memory_pressure = 0;
1440 		return 1;
1441 	}
1442 
1443 	/* Under pressure. */
1444 	if (allocated > prot->sysctl_mem[1])
1445 		if (prot->enter_memory_pressure)
1446 			prot->enter_memory_pressure(sk);
1447 
1448 	/* Over hard limit. */
1449 	if (allocated > prot->sysctl_mem[2])
1450 		goto suppress_allocation;
1451 
1452 	/* guarantee minimum buffer size under pressure */
1453 	if (kind == SK_MEM_RECV) {
1454 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1455 			return 1;
1456 	} else { /* SK_MEM_SEND */
1457 		if (sk->sk_type == SOCK_STREAM) {
1458 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1459 				return 1;
1460 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1461 			   prot->sysctl_wmem[0])
1462 				return 1;
1463 	}
1464 
1465 	if (prot->memory_pressure) {
1466 		if (!*prot->memory_pressure ||
1467 		    prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
1468 		    sk_mem_pages(sk->sk_wmem_queued +
1469 				 atomic_read(&sk->sk_rmem_alloc) +
1470 				 sk->sk_forward_alloc))
1471 			return 1;
1472 	}
1473 
1474 suppress_allocation:
1475 
1476 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1477 		sk_stream_moderate_sndbuf(sk);
1478 
1479 		/* Fail only if socket is _under_ its sndbuf.
1480 		 * In this case we cannot block, so that we have to fail.
1481 		 */
1482 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1483 			return 1;
1484 	}
1485 
1486 	/* Alas. Undo changes. */
1487 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1488 	atomic_sub(amt, prot->memory_allocated);
1489 	return 0;
1490 }
1491 
1492 EXPORT_SYMBOL(__sk_mem_schedule);
1493 
1494 /**
1495  *	__sk_reclaim - reclaim memory_allocated
1496  *	@sk: socket
1497  */
1498 void __sk_mem_reclaim(struct sock *sk)
1499 {
1500 	struct proto *prot = sk->sk_prot;
1501 
1502 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1503 		   prot->memory_allocated);
1504 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1505 
1506 	if (prot->memory_pressure && *prot->memory_pressure &&
1507 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1508 		*prot->memory_pressure = 0;
1509 }
1510 
1511 EXPORT_SYMBOL(__sk_mem_reclaim);
1512 
1513 
1514 /*
1515  * Set of default routines for initialising struct proto_ops when
1516  * the protocol does not support a particular function. In certain
1517  * cases where it makes no sense for a protocol to have a "do nothing"
1518  * function, some default processing is provided.
1519  */
1520 
/* proto_ops stub for protocols without bind(): always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
1525 
/* proto_ops stub for protocols without connect(): always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
1531 
/* proto_ops stub for protocols without socketpair(): always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
1536 
/* proto_ops stub for protocols without accept(): always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
1541 
/* proto_ops stub for protocols without getname(): always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
1547 
1548 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1549 {
1550 	return 0;
1551 }
1552 
/* proto_ops stub for protocols without ioctl(): always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
1557 
/* proto_ops stub for protocols without listen(): always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
1562 
/* proto_ops stub for protocols without shutdown(): always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
1567 
1568 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1569 		    char __user *optval, int optlen)
1570 {
1571 	return -EOPNOTSUPP;
1572 }
1573 
1574 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1575 		    char __user *optval, int __user *optlen)
1576 {
1577 	return -EOPNOTSUPP;
1578 }
1579 
/* proto_ops stub for protocols without sendmsg(): always -EOPNOTSUPP. */
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}
1585 
/* proto_ops stub for protocols without recvmsg(): always -EOPNOTSUPP. */
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
1591 
/*
 * proto_ops stub for protocols without mmap().  Returns -ENODEV, the
 * same error code a missing mmap method produces.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
1597 
1598 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1599 {
1600 	ssize_t res;
1601 	struct msghdr msg = {.msg_flags = flags};
1602 	struct kvec iov;
1603 	char *kaddr = kmap(page);
1604 	iov.iov_base = kaddr + offset;
1605 	iov.iov_len = size;
1606 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1607 	kunmap(page);
1608 	return res;
1609 }
1610 
1611 /*
1612  *	Default Socket Callbacks
1613  */
1614 
1615 static void sock_def_wakeup(struct sock *sk)
1616 {
1617 	read_lock(&sk->sk_callback_lock);
1618 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1619 		wake_up_interruptible_all(sk->sk_sleep);
1620 	read_unlock(&sk->sk_callback_lock);
1621 }
1622 
1623 static void sock_def_error_report(struct sock *sk)
1624 {
1625 	read_lock(&sk->sk_callback_lock);
1626 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1627 		wake_up_interruptible(sk->sk_sleep);
1628 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1629 	read_unlock(&sk->sk_callback_lock);
1630 }
1631 
1632 static void sock_def_readable(struct sock *sk, int len)
1633 {
1634 	read_lock(&sk->sk_callback_lock);
1635 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1636 		wake_up_interruptible_sync(sk->sk_sleep);
1637 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1638 	read_unlock(&sk->sk_callback_lock);
1639 }
1640 
1641 static void sock_def_write_space(struct sock *sk)
1642 {
1643 	read_lock(&sk->sk_callback_lock);
1644 
1645 	/* Do not wake up a writer until he can make "significant"
1646 	 * progress.  --DaveM
1647 	 */
1648 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1649 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1650 			wake_up_interruptible_sync(sk->sk_sleep);
1651 
1652 		/* Should agree with poll, otherwise some programs break */
1653 		if (sock_writeable(sk))
1654 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1655 	}
1656 
1657 	read_unlock(&sk->sk_callback_lock);
1658 }
1659 
1660 static void sock_def_destruct(struct sock *sk)
1661 {
1662 	kfree(sk->sk_protinfo);
1663 }
1664 
1665 void sk_send_sigurg(struct sock *sk)
1666 {
1667 	if (sk->sk_socket && sk->sk_socket->file)
1668 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1669 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1670 }
1671 
/*
 * (Re)arm @timer to fire at @expires.  A socket reference is taken when
 * mod_timer() returns 0 (presumably: the timer was not pending before).
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
1678 
1679 EXPORT_SYMBOL(sk_reset_timer);
1680 
/*
 * Cancel @timer if it is pending; when del_timer() reports that it
 * removed the timer, drop a socket reference with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}
1686 
1687 EXPORT_SYMBOL(sk_stop_timer);
1688 
1689 void sock_init_data(struct socket *sock, struct sock *sk)
1690 {
1691 	skb_queue_head_init(&sk->sk_receive_queue);
1692 	skb_queue_head_init(&sk->sk_write_queue);
1693 	skb_queue_head_init(&sk->sk_error_queue);
1694 #ifdef CONFIG_NET_DMA
1695 	skb_queue_head_init(&sk->sk_async_wait_queue);
1696 #endif
1697 
1698 	sk->sk_send_head	=	NULL;
1699 
1700 	init_timer(&sk->sk_timer);
1701 
1702 	sk->sk_allocation	=	GFP_KERNEL;
1703 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1704 	sk->sk_sndbuf		=	sysctl_wmem_default;
1705 	sk->sk_state		=	TCP_CLOSE;
1706 	sk_set_socket(sk, sock);
1707 
1708 	sock_set_flag(sk, SOCK_ZAPPED);
1709 
1710 	if (sock) {
1711 		sk->sk_type	=	sock->type;
1712 		sk->sk_sleep	=	&sock->wait;
1713 		sock->sk	=	sk;
1714 	} else
1715 		sk->sk_sleep	=	NULL;
1716 
1717 	rwlock_init(&sk->sk_dst_lock);
1718 	rwlock_init(&sk->sk_callback_lock);
1719 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1720 			af_callback_keys + sk->sk_family,
1721 			af_family_clock_key_strings[sk->sk_family]);
1722 
1723 	sk->sk_state_change	=	sock_def_wakeup;
1724 	sk->sk_data_ready	=	sock_def_readable;
1725 	sk->sk_write_space	=	sock_def_write_space;
1726 	sk->sk_error_report	=	sock_def_error_report;
1727 	sk->sk_destruct		=	sock_def_destruct;
1728 
1729 	sk->sk_sndmsg_page	=	NULL;
1730 	sk->sk_sndmsg_off	=	0;
1731 
1732 	sk->sk_peercred.pid 	=	0;
1733 	sk->sk_peercred.uid	=	-1;
1734 	sk->sk_peercred.gid	=	-1;
1735 	sk->sk_write_pending	=	0;
1736 	sk->sk_rcvlowat		=	1;
1737 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1738 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1739 
1740 	sk->sk_stamp = ktime_set(-1L, 0);
1741 
1742 	atomic_set(&sk->sk_refcnt, 1);
1743 	atomic_set(&sk->sk_drops, 0);
1744 }
1745 
1746 void lock_sock_nested(struct sock *sk, int subclass)
1747 {
1748 	might_sleep();
1749 	spin_lock_bh(&sk->sk_lock.slock);
1750 	if (sk->sk_lock.owned)
1751 		__lock_sock(sk);
1752 	sk->sk_lock.owned = 1;
1753 	spin_unlock(&sk->sk_lock.slock);
1754 	/*
1755 	 * The sk_lock has mutex_lock() semantics here:
1756 	 */
1757 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1758 	local_bh_enable();
1759 }
1760 
1761 EXPORT_SYMBOL(lock_sock_nested);
1762 
1763 void release_sock(struct sock *sk)
1764 {
1765 	/*
1766 	 * The sk_lock has mutex_unlock() semantics:
1767 	 */
1768 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1769 
1770 	spin_lock_bh(&sk->sk_lock.slock);
1771 	if (sk->sk_backlog.tail)
1772 		__release_sock(sk);
1773 	sk->sk_lock.owned = 0;
1774 	if (waitqueue_active(&sk->sk_lock.wq))
1775 		wake_up(&sk->sk_lock.wq);
1776 	spin_unlock_bh(&sk->sk_lock.slock);
1777 }
1778 EXPORT_SYMBOL(release_sock);
1779 
1780 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1781 {
1782 	struct timeval tv;
1783 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1784 		sock_enable_timestamp(sk);
1785 	tv = ktime_to_timeval(sk->sk_stamp);
1786 	if (tv.tv_sec == -1)
1787 		return -ENOENT;
1788 	if (tv.tv_sec == 0) {
1789 		sk->sk_stamp = ktime_get_real();
1790 		tv = ktime_to_timeval(sk->sk_stamp);
1791 	}
1792 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1793 }
1794 EXPORT_SYMBOL(sock_get_timestamp);
1795 
1796 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1797 {
1798 	struct timespec ts;
1799 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1800 		sock_enable_timestamp(sk);
1801 	ts = ktime_to_timespec(sk->sk_stamp);
1802 	if (ts.tv_sec == -1)
1803 		return -ENOENT;
1804 	if (ts.tv_sec == 0) {
1805 		sk->sk_stamp = ktime_get_real();
1806 		ts = ktime_to_timespec(sk->sk_stamp);
1807 	}
1808 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1809 }
1810 EXPORT_SYMBOL(sock_get_timestampns);
1811 
1812 void sock_enable_timestamp(struct sock *sk)
1813 {
1814 	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1815 		sock_set_flag(sk, SOCK_TIMESTAMP);
1816 		net_enable_timestamp();
1817 	}
1818 }
1819 
1820 /*
1821  *	Get a socket option on an socket.
1822  *
1823  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1824  *	asynchronous errors should be reported by getsockopt. We assume
1825  *	this means if you specify SO_ERROR (otherwise whats the point of it).
1826  */
1827 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1828 			   char __user *optval, int __user *optlen)
1829 {
1830 	struct sock *sk = sock->sk;
1831 
1832 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1833 }
1834 
1835 EXPORT_SYMBOL(sock_common_getsockopt);
1836 
1837 #ifdef CONFIG_COMPAT
1838 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1839 				  char __user *optval, int __user *optlen)
1840 {
1841 	struct sock *sk = sock->sk;
1842 
1843 	if (sk->sk_prot->compat_getsockopt != NULL)
1844 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1845 						      optval, optlen);
1846 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1847 }
1848 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1849 #endif
1850 
1851 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1852 			struct msghdr *msg, size_t size, int flags)
1853 {
1854 	struct sock *sk = sock->sk;
1855 	int addr_len = 0;
1856 	int err;
1857 
1858 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1859 				   flags & ~MSG_DONTWAIT, &addr_len);
1860 	if (err >= 0)
1861 		msg->msg_namelen = addr_len;
1862 	return err;
1863 }
1864 
1865 EXPORT_SYMBOL(sock_common_recvmsg);
1866 
1867 /*
1868  *	Set socket options on an inet socket.
1869  */
1870 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1871 			   char __user *optval, int optlen)
1872 {
1873 	struct sock *sk = sock->sk;
1874 
1875 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1876 }
1877 
1878 EXPORT_SYMBOL(sock_common_setsockopt);
1879 
1880 #ifdef CONFIG_COMPAT
1881 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1882 				  char __user *optval, int optlen)
1883 {
1884 	struct sock *sk = sock->sk;
1885 
1886 	if (sk->sk_prot->compat_setsockopt != NULL)
1887 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1888 						      optval, optlen);
1889 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1890 }
1891 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1892 #endif
1893 
1894 void sk_common_release(struct sock *sk)
1895 {
1896 	if (sk->sk_prot->destroy)
1897 		sk->sk_prot->destroy(sk);
1898 
1899 	/*
1900 	 * Observation: when sock_common_release is called, processes have
1901 	 * no access to socket. But net still has.
1902 	 * Step one, detach it from networking:
1903 	 *
1904 	 * A. Remove from hash tables.
1905 	 */
1906 
1907 	sk->sk_prot->unhash(sk);
1908 
1909 	/*
1910 	 * In this point socket cannot receive new packets, but it is possible
1911 	 * that some packets are in flight because some CPU runs receiver and
1912 	 * did hash table lookup before we unhashed socket. They will achieve
1913 	 * receive queue and will be purged by socket destructor.
1914 	 *
1915 	 * Also we still have packets pending on receive queue and probably,
1916 	 * our own packets waiting in device queues. sock_destroy will drain
1917 	 * receive queue, but transmitted packets will delay socket destruction
1918 	 * until the last reference will be released.
1919 	 */
1920 
1921 	sock_orphan(sk);
1922 
1923 	xfrm_sk_free_policy(sk);
1924 
1925 	sk_refcnt_debug_release(sk);
1926 	sock_put(sk);
1927 }
1928 
1929 EXPORT_SYMBOL(sk_common_release);
1930 
/* Registered protocols; proto_register() adds entries, proto_unregister() removes them. */
static LIST_HEAD(proto_list);
/* Held for writing while proto_list is modified, for reading while it is walked. */
static DEFINE_RWLOCK(proto_list_lock);
1933 
1934 #ifdef CONFIG_PROC_FS
1935 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
1936 struct prot_inuse {
1937 	int val[PROTO_INUSE_NR];
1938 };
1939 
1940 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1941 
1942 #ifdef CONFIG_NET_NS
1943 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1944 {
1945 	int cpu = smp_processor_id();
1946 	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1947 }
1948 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1949 
1950 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1951 {
1952 	int cpu, idx = prot->inuse_idx;
1953 	int res = 0;
1954 
1955 	for_each_possible_cpu(cpu)
1956 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1957 
1958 	return res >= 0 ? res : 0;
1959 }
1960 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1961 
1962 static int sock_inuse_init_net(struct net *net)
1963 {
1964 	net->core.inuse = alloc_percpu(struct prot_inuse);
1965 	return net->core.inuse ? 0 : -ENOMEM;
1966 }
1967 
1968 static void sock_inuse_exit_net(struct net *net)
1969 {
1970 	free_percpu(net->core.inuse);
1971 }
1972 
1973 static struct pernet_operations net_inuse_ops = {
1974 	.init = sock_inuse_init_net,
1975 	.exit = sock_inuse_exit_net,
1976 };
1977 
1978 static __init int net_inuse_init(void)
1979 {
1980 	if (register_pernet_subsys(&net_inuse_ops))
1981 		panic("Cannot initialize net inuse counters");
1982 
1983 	return 0;
1984 }
1985 
1986 core_initcall(net_inuse_init);
1987 #else
1988 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1989 
1990 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1991 {
1992 	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
1993 }
1994 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1995 
1996 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1997 {
1998 	int cpu, idx = prot->inuse_idx;
1999 	int res = 0;
2000 
2001 	for_each_possible_cpu(cpu)
2002 		res += per_cpu(prot_inuse, cpu).val[idx];
2003 
2004 	return res >= 0 ? res : 0;
2005 }
2006 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2007 #endif
2008 
2009 static void assign_proto_idx(struct proto *prot)
2010 {
2011 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2012 
2013 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2014 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2015 		return;
2016 	}
2017 
2018 	set_bit(prot->inuse_idx, proto_inuse_idx);
2019 }
2020 
2021 static void release_proto_idx(struct proto *prot)
2022 {
2023 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2024 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2025 }
2026 #else
/* Without CONFIG_PROC_FS there are no in-use counters to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
	/* nothing to do */
}
2030 
/* Without CONFIG_PROC_FS there are no in-use counters to release. */
static inline void release_proto_idx(struct proto *prot)
{
	/* nothing to do */
}
2034 #endif
2035 
2036 int proto_register(struct proto *prot, int alloc_slab)
2037 {
2038 	if (alloc_slab) {
2039 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2040 					       SLAB_HWCACHE_ALIGN, NULL);
2041 
2042 		if (prot->slab == NULL) {
2043 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2044 			       prot->name);
2045 			goto out;
2046 		}
2047 
2048 		if (prot->rsk_prot != NULL) {
2049 			static const char mask[] = "request_sock_%s";
2050 
2051 			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2052 			if (prot->rsk_prot->slab_name == NULL)
2053 				goto out_free_sock_slab;
2054 
2055 			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2056 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2057 								 prot->rsk_prot->obj_size, 0,
2058 								 SLAB_HWCACHE_ALIGN, NULL);
2059 
2060 			if (prot->rsk_prot->slab == NULL) {
2061 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2062 				       prot->name);
2063 				goto out_free_request_sock_slab_name;
2064 			}
2065 		}
2066 
2067 		if (prot->twsk_prot != NULL) {
2068 			static const char mask[] = "tw_sock_%s";
2069 
2070 			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2071 
2072 			if (prot->twsk_prot->twsk_slab_name == NULL)
2073 				goto out_free_request_sock_slab;
2074 
2075 			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2076 			prot->twsk_prot->twsk_slab =
2077 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2078 						  prot->twsk_prot->twsk_obj_size,
2079 						  0, SLAB_HWCACHE_ALIGN,
2080 						  NULL);
2081 			if (prot->twsk_prot->twsk_slab == NULL)
2082 				goto out_free_timewait_sock_slab_name;
2083 		}
2084 	}
2085 
2086 	write_lock(&proto_list_lock);
2087 	list_add(&prot->node, &proto_list);
2088 	assign_proto_idx(prot);
2089 	write_unlock(&proto_list_lock);
2090 	return 0;
2091 
2092 out_free_timewait_sock_slab_name:
2093 	kfree(prot->twsk_prot->twsk_slab_name);
2094 out_free_request_sock_slab:
2095 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2096 		kmem_cache_destroy(prot->rsk_prot->slab);
2097 		prot->rsk_prot->slab = NULL;
2098 	}
2099 out_free_request_sock_slab_name:
2100 	kfree(prot->rsk_prot->slab_name);
2101 out_free_sock_slab:
2102 	kmem_cache_destroy(prot->slab);
2103 	prot->slab = NULL;
2104 out:
2105 	return -ENOBUFS;
2106 }
2107 
2108 EXPORT_SYMBOL(proto_register);
2109 
2110 void proto_unregister(struct proto *prot)
2111 {
2112 	write_lock(&proto_list_lock);
2113 	release_proto_idx(prot);
2114 	list_del(&prot->node);
2115 	write_unlock(&proto_list_lock);
2116 
2117 	if (prot->slab != NULL) {
2118 		kmem_cache_destroy(prot->slab);
2119 		prot->slab = NULL;
2120 	}
2121 
2122 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2123 		kmem_cache_destroy(prot->rsk_prot->slab);
2124 		kfree(prot->rsk_prot->slab_name);
2125 		prot->rsk_prot->slab = NULL;
2126 	}
2127 
2128 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2129 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2130 		kfree(prot->twsk_prot->twsk_slab_name);
2131 		prot->twsk_prot->twsk_slab = NULL;
2132 	}
2133 }
2134 
2135 EXPORT_SYMBOL(proto_unregister);
2136 
2137 #ifdef CONFIG_PROC_FS
2138 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2139 	__acquires(proto_list_lock)
2140 {
2141 	read_lock(&proto_list_lock);
2142 	return seq_list_start_head(&proto_list, *pos);
2143 }
2144 
2145 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2146 {
2147 	return seq_list_next(v, &proto_list, pos);
2148 }
2149 
2150 static void proto_seq_stop(struct seq_file *seq, void *v)
2151 	__releases(proto_list_lock)
2152 {
2153 	read_unlock(&proto_list_lock);
2154 }
2155 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2160 
2161 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2162 {
2163 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2164 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2165 		   proto->name,
2166 		   proto->obj_size,
2167 		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
2168 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2169 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2170 		   proto->max_header,
2171 		   proto->slab == NULL ? "no" : "yes",
2172 		   module_name(proto->owner),
2173 		   proto_method_implemented(proto->close),
2174 		   proto_method_implemented(proto->connect),
2175 		   proto_method_implemented(proto->disconnect),
2176 		   proto_method_implemented(proto->accept),
2177 		   proto_method_implemented(proto->ioctl),
2178 		   proto_method_implemented(proto->init),
2179 		   proto_method_implemented(proto->destroy),
2180 		   proto_method_implemented(proto->shutdown),
2181 		   proto_method_implemented(proto->setsockopt),
2182 		   proto_method_implemented(proto->getsockopt),
2183 		   proto_method_implemented(proto->sendmsg),
2184 		   proto_method_implemented(proto->recvmsg),
2185 		   proto_method_implemented(proto->sendpage),
2186 		   proto_method_implemented(proto->bind),
2187 		   proto_method_implemented(proto->backlog_rcv),
2188 		   proto_method_implemented(proto->hash),
2189 		   proto_method_implemented(proto->unhash),
2190 		   proto_method_implemented(proto->get_port),
2191 		   proto_method_implemented(proto->enter_memory_pressure));
2192 }
2193 
2194 static int proto_seq_show(struct seq_file *seq, void *v)
2195 {
2196 	if (v == &proto_list)
2197 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2198 			   "protocol",
2199 			   "size",
2200 			   "sockets",
2201 			   "memory",
2202 			   "press",
2203 			   "maxhdr",
2204 			   "slab",
2205 			   "module",
2206 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2207 	else
2208 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2209 	return 0;
2210 }
2211 
2212 static const struct seq_operations proto_seq_ops = {
2213 	.start  = proto_seq_start,
2214 	.next   = proto_seq_next,
2215 	.stop   = proto_seq_stop,
2216 	.show   = proto_seq_show,
2217 };
2218 
2219 static int proto_seq_open(struct inode *inode, struct file *file)
2220 {
2221 	return seq_open(file, &proto_seq_ops);
2222 }
2223 
2224 static const struct file_operations proto_seq_fops = {
2225 	.owner		= THIS_MODULE,
2226 	.open		= proto_seq_open,
2227 	.read		= seq_read,
2228 	.llseek		= seq_lseek,
2229 	.release	= seq_release,
2230 };
2231 
2232 static int __init proto_init(void)
2233 {
2234 	/* register /proc/net/protocols */
2235 	return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
2236 }
2237 
2238 subsys_initcall(proto_init);
2239 
2240 #endif /* PROC_FS */
2241 
/* Symbols made available to other kernel code; kept in alphabetical order. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sysctl_optmem_max);
2271