xref: /linux/net/core/sock.c (revision 20d0021394c1b070bf04b22c5bc8fdb437edd4c5)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
11  *
12  * Authors:	Ross Biro
13  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
14  *		Florian La Roche, <flla@stud.uni-sb.de>
15  *		Alan Cox, <A.Cox@swansea.ac.uk>
16  *
17  * Fixes:
18  *		Alan Cox	: 	Numerous verify_area() problems
19  *		Alan Cox	:	Connecting on a connecting socket
20  *					now returns an error for tcp.
21  *		Alan Cox	:	sock->protocol is set correctly.
22  *					and is not sometimes left as 0.
23  *		Alan Cox	:	connect handles icmp errors on a
24  *					connect properly. Unfortunately there
25  *					is a restart syscall nasty there. I
26  *					can't match BSD without hacking the C
27  *					library. Ideas urgently sought!
28  *		Alan Cox	:	Disallow bind() to addresses that are
29  *					not ours - especially broadcast ones!!
30  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
31  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
32  *					instead they leave that for the DESTROY timer.
33  *		Alan Cox	:	Clean up error flag in accept
34  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
35  *					was buggy. Put a remove_sock() in the handler
36  *					for memory when we hit 0. Also altered the timer
37  *					code. The ACK stuff can wait and needs major
38  *					TCP layer surgery.
39  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
40  *					and fixed timer/inet_bh race.
41  *		Alan Cox	:	Added zapped flag for TCP
42  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
43  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
44  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
45  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
46  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
47  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
48  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
49  *	Pauline Middelink	:	identd support
50  *		Alan Cox	:	Fixed connect() taking signals I think.
51  *		Alan Cox	:	SO_LINGER supported
52  *		Alan Cox	:	Error reporting fixes
53  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
54  *		Alan Cox	:	inet sockets don't set sk->type!
55  *		Alan Cox	:	Split socket option code
56  *		Alan Cox	:	Callbacks
57  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
58  *		Alex		:	Removed restriction on inet fioctl
59  *		Alan Cox	:	Splitting INET from NET core
60  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
61  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
62  *		Alan Cox	:	Split IP from generic code
63  *		Alan Cox	:	New kfree_skbmem()
64  *		Alan Cox	:	Make SO_DEBUG superuser only.
65  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
66  *					(compatibility fix)
67  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
68  *		Alan Cox	:	Allocator for a socket is settable.
69  *		Alan Cox	:	SO_ERROR includes soft errors.
70  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
71  *		Alan Cox	: 	Generic socket allocation to make hooks
72  *					easier (suggested by Craig Metz).
73  *		Michael Pall	:	SO_ERROR returns positive errno again
74  *		Steve Whitehouse:	Added default destructor to free
75  *					protocol private data.
76  *		Steve Whitehouse:	Added various other default routines
77  *					common to several socket families.
78  *		Chris Evans	:	Call suser() check last on F_SETOWN
79  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
80  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
81  *		Andi Kleen	:	Fix write_space callback
82  *		Chris Evans	:	Security fixes - signedness again
83  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
84  *
85  * To Fix:
86  *
87  *
88  *		This program is free software; you can redistribute it and/or
89  *		modify it under the terms of the GNU General Public License
90  *		as published by the Free Software Foundation; either version
91  *		2 of the License, or (at your option) any later version.
92  */
93 
94 #include <linux/config.h>
95 #include <linux/errno.h>
96 #include <linux/types.h>
97 #include <linux/socket.h>
98 #include <linux/in.h>
99 #include <linux/kernel.h>
100 #include <linux/module.h>
101 #include <linux/proc_fs.h>
102 #include <linux/seq_file.h>
103 #include <linux/sched.h>
104 #include <linux/timer.h>
105 #include <linux/string.h>
106 #include <linux/sockios.h>
107 #include <linux/net.h>
108 #include <linux/mm.h>
109 #include <linux/slab.h>
110 #include <linux/interrupt.h>
111 #include <linux/poll.h>
112 #include <linux/tcp.h>
113 #include <linux/init.h>
114 
115 #include <asm/uaccess.h>
116 #include <asm/system.h>
117 
118 #include <linux/netdevice.h>
119 #include <net/protocol.h>
120 #include <linux/skbuff.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <net/xfrm.h>
124 #include <linux/ipsec.h>
125 
126 #include <linux/filter.h>
127 
128 #ifdef CONFIG_INET
129 #include <net/tcp.h>
130 #endif
131 
132 /* Take the size of the struct sk_buff overhead into account when
133  * determining these values, since that size is not constant across
134  * platforms.  This keeps socket queueing behavior and performance
135  * independent of such differences.
136  */
137 #define _SK_MEM_PACKETS		256
138 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
139 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
140 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
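/*
 * Illustrative arithmetic (the exact figure is platform-dependent; the
 * sk_buff size below is only an assumed example value):
 *
 *	sizeof(struct sk_buff) ~= 240                 (assumption)
 *	_SK_MEM_OVERHEAD        = 240 + 256 =    496  bytes per packet
 *	SK_WMEM_MAX             = 496 * 256 = 126976  bytes (~124 KiB)
 *
 * i.e. the defaults are sized to hold roughly 256 packets' worth of
 * data-plus-overhead in each direction.
 */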
141 
142 /* Run time adjustable parameters. */
143 __u32 sysctl_wmem_max = SK_WMEM_MAX;
144 __u32 sysctl_rmem_max = SK_RMEM_MAX;
145 __u32 sysctl_wmem_default = SK_WMEM_MAX;
146 __u32 sysctl_rmem_default = SK_RMEM_MAX;
147 
148 /* Maximum space eaten by an iovec or ancillary data, plus some slack */
149 int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
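/*
 * Worked example, assuming a 32-bit platform (sizeof(unsigned long) == 4)
 * and UIO_MAXIOV == 1024:
 *
 *	sysctl_optmem_max = 4 * (2 * 1024 + 512) = 10240 bytes
 */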
150 
151 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
152 {
153 	struct timeval tv;
154 
155 	if (optlen < sizeof(tv))
156 		return -EINVAL;
157 	if (copy_from_user(&tv, optval, sizeof(tv)))
158 		return -EFAULT;
159 
160 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
161 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
162 		return 0;
163 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
164 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
165 	return 0;
166 }
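/*
 * The conversion above rounds tv_usec up to whole jiffies and treats a
 * zero timeval as "block forever" (MAX_SCHEDULE_TIMEOUT).  A userspace
 * sketch that reaches this helper through SO_RCVTIMEO (illustrative):
 *
 *	struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt");
 *
 * With HZ == 1000 this stores 2 * 1000 + 500000/1000 = 2500 jiffies in
 * sk->sk_rcvtimeo.
 */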
167 
168 static void sock_warn_obsolete_bsdism(const char *name)
169 {
170 	static int warned;
171 	static char warncomm[TASK_COMM_LEN];
172 	if (strcmp(warncomm, current->comm) && warned < 5) {
173 		strcpy(warncomm,  current->comm);
174 		printk(KERN_WARNING "process `%s' is using obsolete "
175 		       "%s SO_BSDCOMPAT\n", warncomm, name);
176 		warned++;
177 	}
178 }
179 
180 static void sock_disable_timestamp(struct sock *sk)
181 {
182 	if (sock_flag(sk, SOCK_TIMESTAMP)) {
183 		sock_reset_flag(sk, SOCK_TIMESTAMP);
184 		net_disable_timestamp();
185 	}
186 }
187 
188 
189 /*
190  *	This is meant for all protocols to use and covers goings on
191  *	at the socket level. Everything here is generic.
192  */
193 
194 int sock_setsockopt(struct socket *sock, int level, int optname,
195 		    char __user *optval, int optlen)
196 {
197 	struct sock *sk=sock->sk;
198 	struct sk_filter *filter;
199 	int val;
200 	int valbool;
201 	struct linger ling;
202 	int ret = 0;
203 
204 	/*
205 	 *	Options without arguments
206 	 */
207 
208 #ifdef SO_DONTLINGER		/* Compatibility item... */
209 	switch (optname) {
210 		case SO_DONTLINGER:
211 			sock_reset_flag(sk, SOCK_LINGER);
212 			return 0;
213 	}
214 #endif
215 
216   	if(optlen<sizeof(int))
217   		return(-EINVAL);
218 
219 	if (get_user(val, (int __user *)optval))
220 		return -EFAULT;
221 
222   	valbool = val?1:0;
223 
224 	lock_sock(sk);
225 
226   	switch(optname)
227   	{
228 		case SO_DEBUG:
229 			if(val && !capable(CAP_NET_ADMIN))
230 			{
231 				ret = -EACCES;
232 			}
233 			else if (valbool)
234 				sock_set_flag(sk, SOCK_DBG);
235 			else
236 				sock_reset_flag(sk, SOCK_DBG);
237 			break;
238 		case SO_REUSEADDR:
239 			sk->sk_reuse = valbool;
240 			break;
241 		case SO_TYPE:
242 		case SO_ERROR:
243 			ret = -ENOPROTOOPT;
244 		  	break;
245 		case SO_DONTROUTE:
246 			if (valbool)
247 				sock_set_flag(sk, SOCK_LOCALROUTE);
248 			else
249 				sock_reset_flag(sk, SOCK_LOCALROUTE);
250 			break;
251 		case SO_BROADCAST:
252 			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
253 			break;
254 		case SO_SNDBUF:
255 			/* Don't error on this; BSD doesn't, and if you think
256 			   about it, this is right.  Otherwise apps have to
257 			   play 'guess the biggest size' games.  RCVBUF/SNDBUF
258 			   are treated in BSD as hints. */
259 
260 			if (val > sysctl_wmem_max)
261 				val = sysctl_wmem_max;
262 
263 			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
264 			if ((val * 2) < SOCK_MIN_SNDBUF)
265 				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
266 			else
267 				sk->sk_sndbuf = val * 2;
268 
269 			/*
270 			 *	Wake up sending tasks if we
271 			 *	upped the value.
272 			 */
273 			sk->sk_write_space(sk);
274 			break;
275 
276 		case SO_RCVBUF:
277 			/* Don't error on this; BSD doesn't, and if you think
278 			   about it, this is right.  Otherwise apps have to
279 			   play 'guess the biggest size' games.  RCVBUF/SNDBUF
280 			   are treated in BSD as hints. */
281 
282 			if (val > sysctl_rmem_max)
283 				val = sysctl_rmem_max;
284 
285 			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
286 			/* FIXME: is this lower bound the right one? */
287 			if ((val * 2) < SOCK_MIN_RCVBUF)
288 				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
289 			else
290 				sk->sk_rcvbuf = val * 2;
291 			break;
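/*
 * Note the doubling in the two cases above: the kernel reserves the
 * requested amount for payload and as much again for sk_buff overhead,
 * and getsockopt() later reports the doubled value.  Userspace sketch
 * (illustrative):
 *
 *	int req = 65536, got;
 *	socklen_t len = sizeof(got);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
 *
 * got is now 131072 (2 * req), assuming req did not exceed sysctl_rmem_max.
 */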
292 
293 		case SO_KEEPALIVE:
294 #ifdef CONFIG_INET
295 			if (sk->sk_protocol == IPPROTO_TCP)
296 				tcp_set_keepalive(sk, valbool);
297 #endif
298 			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
299 			break;
300 
301 	 	case SO_OOBINLINE:
302 			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
303 			break;
304 
305 	 	case SO_NO_CHECK:
306 			sk->sk_no_check = valbool;
307 			break;
308 
309 		case SO_PRIORITY:
310 			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
311 				sk->sk_priority = val;
312 			else
313 				ret = -EPERM;
314 			break;
315 
316 		case SO_LINGER:
317 			if(optlen<sizeof(ling)) {
318 				ret = -EINVAL;	/* 1003.1g */
319 				break;
320 			}
321 			if (copy_from_user(&ling,optval,sizeof(ling))) {
322 				ret = -EFAULT;
323 				break;
324 			}
325 			if (!ling.l_onoff)
326 				sock_reset_flag(sk, SOCK_LINGER);
327 			else {
328 #if (BITS_PER_LONG == 32)
329 				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
330 					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
331 				else
332 #endif
333 					sk->sk_lingertime = ling.l_linger * HZ;
334 				sock_set_flag(sk, SOCK_LINGER);
335 			}
336 			break;
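/*
 * Example: enable a 10 second linger-on-close (userspace, illustrative).
 * The kernel stores the value in jiffies, i.e. 10 * HZ here:
 *
 *	struct linger ling = { .l_onoff = 1, .l_linger = 10 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_LINGER, &ling, sizeof(ling));
 */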
337 
338 		case SO_BSDCOMPAT:
339 			sock_warn_obsolete_bsdism("setsockopt");
340 			break;
341 
342 		case SO_PASSCRED:
343 			if (valbool)
344 				set_bit(SOCK_PASSCRED, &sock->flags);
345 			else
346 				clear_bit(SOCK_PASSCRED, &sock->flags);
347 			break;
348 
349 		case SO_TIMESTAMP:
350 			if (valbool)  {
351 				sock_set_flag(sk, SOCK_RCVTSTAMP);
352 				sock_enable_timestamp(sk);
353 			} else
354 				sock_reset_flag(sk, SOCK_RCVTSTAMP);
355 			break;
356 
357 		case SO_RCVLOWAT:
358 			if (val < 0)
359 				val = INT_MAX;
360 			sk->sk_rcvlowat = val ? : 1;
361 			break;
362 
363 		case SO_RCVTIMEO:
364 			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
365 			break;
366 
367 		case SO_SNDTIMEO:
368 			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
369 			break;
370 
371 #ifdef CONFIG_NETDEVICES
372 		case SO_BINDTODEVICE:
373 		{
374 			char devname[IFNAMSIZ];
375 
376 			/* Sorry... */
377 			if (!capable(CAP_NET_RAW)) {
378 				ret = -EPERM;
379 				break;
380 			}
381 
382 			/* Bind this socket to a particular device like "eth0",
383 			 * as specified in the passed interface name. If the
384 			 * name is "" or the option length is zero the socket
385 			 * is not bound.
386 			 */
387 
388 			if (!valbool) {
389 				sk->sk_bound_dev_if = 0;
390 			} else {
391 				if (optlen > IFNAMSIZ)
392 					optlen = IFNAMSIZ;
393 				if (copy_from_user(devname, optval, optlen)) {
394 					ret = -EFAULT;
395 					break;
396 				}
397 
398 				/* Remove any cached route for this socket. */
399 				sk_dst_reset(sk);
400 
401 				if (devname[0] == '\0') {
402 					sk->sk_bound_dev_if = 0;
403 				} else {
404 					struct net_device *dev = dev_get_by_name(devname);
405 					if (!dev) {
406 						ret = -ENODEV;
407 						break;
408 					}
409 					sk->sk_bound_dev_if = dev->ifindex;
410 					dev_put(dev);
411 				}
412 			}
413 			break;
414 		}
415 #endif
416 
417 
418 		case SO_ATTACH_FILTER:
419 			ret = -EINVAL;
420 			if (optlen == sizeof(struct sock_fprog)) {
421 				struct sock_fprog fprog;
422 
423 				ret = -EFAULT;
424 				if (copy_from_user(&fprog, optval, sizeof(fprog)))
425 					break;
426 
427 				ret = sk_attach_filter(&fprog, sk);
428 			}
429 			break;
430 
431 		case SO_DETACH_FILTER:
432 			spin_lock_bh(&sk->sk_lock.slock);
433 			filter = sk->sk_filter;
434 			if (filter) {
435 				sk->sk_filter = NULL;
436 				spin_unlock_bh(&sk->sk_lock.slock);
437 				sk_filter_release(sk, filter);
438 				break;
439 			}
440 			spin_unlock_bh(&sk->sk_lock.slock);
441 			ret = -ENONET;
442 			break;
443 
444 		/* We implement SO_SNDLOWAT etc. as not settable
445 		   (1003.1g 5.3). */
446 		default:
447 		  	ret = -ENOPROTOOPT;
448 			break;
449   	}
450 	release_sock(sk);
451 	return ret;
452 }
453 
454 
455 int sock_getsockopt(struct socket *sock, int level, int optname,
456 		    char __user *optval, int __user *optlen)
457 {
458 	struct sock *sk = sock->sk;
459 
460 	union
461 	{
462   		int val;
463   		struct linger ling;
464 		struct timeval tm;
465 	} v;
466 
467 	unsigned int lv = sizeof(int);
468 	int len;
469 
470   	if(get_user(len,optlen))
471   		return -EFAULT;
472 	if(len < 0)
473 		return -EINVAL;
474 
475   	switch(optname)
476   	{
477 		case SO_DEBUG:
478 			v.val = sock_flag(sk, SOCK_DBG);
479 			break;
480 
481 		case SO_DONTROUTE:
482 			v.val = sock_flag(sk, SOCK_LOCALROUTE);
483 			break;
484 
485 		case SO_BROADCAST:
486 			v.val = !!sock_flag(sk, SOCK_BROADCAST);
487 			break;
488 
489 		case SO_SNDBUF:
490 			v.val = sk->sk_sndbuf;
491 			break;
492 
493 		case SO_RCVBUF:
494 			v.val = sk->sk_rcvbuf;
495 			break;
496 
497 		case SO_REUSEADDR:
498 			v.val = sk->sk_reuse;
499 			break;
500 
501 		case SO_KEEPALIVE:
502 			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
503 			break;
504 
505 		case SO_TYPE:
506 			v.val = sk->sk_type;
507 			break;
508 
509 		case SO_ERROR:
510 			v.val = -sock_error(sk);
511 			if(v.val==0)
512 				v.val = xchg(&sk->sk_err_soft, 0);
513 			break;
514 
515 		case SO_OOBINLINE:
516 			v.val = !!sock_flag(sk, SOCK_URGINLINE);
517 			break;
518 
519 		case SO_NO_CHECK:
520 			v.val = sk->sk_no_check;
521 			break;
522 
523 		case SO_PRIORITY:
524 			v.val = sk->sk_priority;
525 			break;
526 
527 		case SO_LINGER:
528 			lv		= sizeof(v.ling);
529 			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
530  			v.ling.l_linger	= sk->sk_lingertime / HZ;
531 			break;
532 
533 		case SO_BSDCOMPAT:
534 			sock_warn_obsolete_bsdism("getsockopt");
535 			break;
536 
537 		case SO_TIMESTAMP:
538 			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
539 			break;
540 
541 		case SO_RCVTIMEO:
542 			lv=sizeof(struct timeval);
543 			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
544 				v.tm.tv_sec = 0;
545 				v.tm.tv_usec = 0;
546 			} else {
547 				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
548 				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
549 			}
550 			break;
551 
552 		case SO_SNDTIMEO:
553 			lv=sizeof(struct timeval);
554 			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
555 				v.tm.tv_sec = 0;
556 				v.tm.tv_usec = 0;
557 			} else {
558 				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
559 				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
560 			}
561 			break;
562 
563 		case SO_RCVLOWAT:
564 			v.val = sk->sk_rcvlowat;
565 			break;
566 
567 		case SO_SNDLOWAT:
568 			v.val=1;
569 			break;
570 
571 		case SO_PASSCRED:
572 			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
573 			break;
574 
575 		case SO_PEERCRED:
576 			if (len > sizeof(sk->sk_peercred))
577 				len = sizeof(sk->sk_peercred);
578 			if (copy_to_user(optval, &sk->sk_peercred, len))
579 				return -EFAULT;
580 			goto lenout;
581 
582 		case SO_PEERNAME:
583 		{
584 			char address[128];
585 
586 			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
587 				return -ENOTCONN;
588 			if (lv < len)
589 				return -EINVAL;
590 			if (copy_to_user(optval, address, len))
591 				return -EFAULT;
592 			goto lenout;
593 		}
594 
595 		/* Dubious BSD thing... Probably nobody even uses it, but
596 		 * the UNIX standard wants it for whatever reason... -DaveM
597 		 */
598 		case SO_ACCEPTCONN:
599 			v.val = sk->sk_state == TCP_LISTEN;
600 			break;
601 
602 		case SO_PEERSEC:
603 			return security_socket_getpeersec(sock, optval, optlen, len);
604 
605 		default:
606 			return(-ENOPROTOOPT);
607 	}
608 	if (len > lv)
609 		len = lv;
610 	if (copy_to_user(optval, &v, len))
611 		return -EFAULT;
612 lenout:
613   	if (put_user(len, optlen))
614   		return -EFAULT;
615   	return 0;
616 }
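/*
 * A classic consumer of the SO_ERROR case above is a non-blocking
 * connect(): once poll() reports the socket writable, userspace fetches
 * (and thereby clears) the pending status (sketch, illustrative):
 *
 *	int err;
 *	socklen_t len = sizeof(err);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err)
 *		fprintf(stderr, "connect: %s\n", strerror(err));
 */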
617 
618 /**
619  *	sk_alloc - All socket objects are allocated here
620  *	@family: protocol family
621  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
622  *	@prot: struct proto associated with this new sock instance
623  *	@zero_it: if we should zero the newly allocated sock
624  */
625 struct sock *sk_alloc(int family, unsigned int __nocast priority,
626 		      struct proto *prot, int zero_it)
627 {
628 	struct sock *sk = NULL;
629 	kmem_cache_t *slab = prot->slab;
630 
631 	if (slab != NULL)
632 		sk = kmem_cache_alloc(slab, priority);
633 	else
634 		sk = kmalloc(prot->obj_size, priority);
635 
636 	if (sk) {
637 		if (zero_it) {
638 			memset(sk, 0, prot->obj_size);
639 			sk->sk_family = family;
640 			/*
641 			 * See comment in struct sock definition to understand
642 			 * why we need sk_prot_creator -acme
643 			 */
644 			sk->sk_prot = sk->sk_prot_creator = prot;
645 			sock_lock_init(sk);
646 		}
647 
648 		if (security_sk_alloc(sk, family, priority)) {
649 			if (slab != NULL)
650 				kmem_cache_free(slab, sk);
651 			else
652 				kfree(sk);
653 			sk = NULL;
654 		} else
655 			__module_get(prot->owner);
656 	}
657 	return sk;
658 }
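/*
 * Typical caller: a protocol family's create routine (sketch; my_proto
 * and my_sock are hypothetical names):
 *
 *	sk = sk_alloc(PF_INET, GFP_KERNEL, &my_proto, 1);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * Passing zero_it == 1 zeroes the first prot->obj_size bytes, so any
 * protocol-private fields in my_sock start out cleared.
 */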
659 
660 void sk_free(struct sock *sk)
661 {
662 	struct sk_filter *filter;
663 	struct module *owner = sk->sk_prot_creator->owner;
664 
665 	if (sk->sk_destruct)
666 		sk->sk_destruct(sk);
667 
668 	filter = sk->sk_filter;
669 	if (filter) {
670 		sk_filter_release(sk, filter);
671 		sk->sk_filter = NULL;
672 	}
673 
674 	sock_disable_timestamp(sk);
675 
676 	if (atomic_read(&sk->sk_omem_alloc))
677 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
678 		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
679 
680 	security_sk_free(sk);
681 	if (sk->sk_prot_creator->slab != NULL)
682 		kmem_cache_free(sk->sk_prot_creator->slab, sk);
683 	else
684 		kfree(sk);
685 	module_put(owner);
686 }
687 
688 void __init sk_init(void)
689 {
690 	if (num_physpages <= 4096) {
691 		sysctl_wmem_max = 32767;
692 		sysctl_rmem_max = 32767;
693 		sysctl_wmem_default = 32767;
694 		sysctl_rmem_default = 32767;
695 	} else if (num_physpages >= 131072) {
696 		sysctl_wmem_max = 131071;
697 		sysctl_rmem_max = 131071;
698 	}
699 }
700 
701 /*
702  *	Simple resource managers for sockets.
703  */
704 
705 
706 /*
707  * Write buffer destructor automatically called from kfree_skb.
708  */
709 void sock_wfree(struct sk_buff *skb)
710 {
711 	struct sock *sk = skb->sk;
712 
713 	/* In case it might be waiting for more memory. */
714 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
715 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
716 		sk->sk_write_space(sk);
717 	sock_put(sk);
718 }
719 
720 /*
721  * Read buffer destructor automatically called from kfree_skb.
722  */
723 void sock_rfree(struct sk_buff *skb)
724 {
725 	struct sock *sk = skb->sk;
726 
727 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
728 }
729 
730 
731 int sock_i_uid(struct sock *sk)
732 {
733 	int uid;
734 
735 	read_lock(&sk->sk_callback_lock);
736 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
737 	read_unlock(&sk->sk_callback_lock);
738 	return uid;
739 }
740 
741 unsigned long sock_i_ino(struct sock *sk)
742 {
743 	unsigned long ino;
744 
745 	read_lock(&sk->sk_callback_lock);
746 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
747 	read_unlock(&sk->sk_callback_lock);
748 	return ino;
749 }
750 
751 /*
752  * Allocate an skb from the socket's send buffer.
753  */
754 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
755 			     unsigned int __nocast priority)
756 {
757 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
758 		struct sk_buff * skb = alloc_skb(size, priority);
759 		if (skb) {
760 			skb_set_owner_w(skb, sk);
761 			return skb;
762 		}
763 	}
764 	return NULL;
765 }
766 
767 /*
768  * Allocate a skb from the socket's receive buffer.
769  */
770 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
771 			     unsigned int __nocast priority)
772 {
773 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
774 		struct sk_buff *skb = alloc_skb(size, priority);
775 		if (skb) {
776 			skb_set_owner_r(skb, sk);
777 			return skb;
778 		}
779 	}
780 	return NULL;
781 }
782 
783 /*
784  * Allocate a memory block from the socket's option memory buffer.
785  */
786 void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
787 {
788 	if ((unsigned)size <= sysctl_optmem_max &&
789 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
790 		void *mem;
791 		/* First do the add, to avoid the race if kmalloc
792  		 * might sleep.
793 		 */
794 		atomic_add(size, &sk->sk_omem_alloc);
795 		mem = kmalloc(size, priority);
796 		if (mem)
797 			return mem;
798 		atomic_sub(size, &sk->sk_omem_alloc);
799 	}
800 	return NULL;
801 }
802 
803 /*
804  * Free an option memory block.
805  */
806 void sock_kfree_s(struct sock *sk, void *mem, int size)
807 {
808 	kfree(mem);
809 	atomic_sub(size, &sk->sk_omem_alloc);
810 }
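/*
 * sock_kmalloc() and sock_kfree_s() must be used as a pair with the same
 * size argument, since sk_omem_alloc accounting is by byte count, e.g.:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (opt) {
 *		...
 *		sock_kfree_s(sk, opt, optlen);
 *	}
 */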
811 
812 /* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
813    I think these locks should be removed for datagram sockets.
814  */
815 static long sock_wait_for_wmem(struct sock * sk, long timeo)
816 {
817 	DEFINE_WAIT(wait);
818 
819 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
820 	for (;;) {
821 		if (!timeo)
822 			break;
823 		if (signal_pending(current))
824 			break;
825 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
826 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
827 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
828 			break;
829 		if (sk->sk_shutdown & SEND_SHUTDOWN)
830 			break;
831 		if (sk->sk_err)
832 			break;
833 		timeo = schedule_timeout(timeo);
834 	}
835 	finish_wait(sk->sk_sleep, &wait);
836 	return timeo;
837 }
838 
839 
840 /*
841  *	Generic send/receive buffer handlers
842  */
843 
844 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
845 					    unsigned long header_len,
846 					    unsigned long data_len,
847 					    int noblock, int *errcode)
848 {
849 	struct sk_buff *skb;
850 	unsigned int gfp_mask;
851 	long timeo;
852 	int err;
853 
854 	gfp_mask = sk->sk_allocation;
855 	if (gfp_mask & __GFP_WAIT)
856 		gfp_mask |= __GFP_REPEAT;
857 
858 	timeo = sock_sndtimeo(sk, noblock);
859 	while (1) {
860 		err = sock_error(sk);
861 		if (err != 0)
862 			goto failure;
863 
864 		err = -EPIPE;
865 		if (sk->sk_shutdown & SEND_SHUTDOWN)
866 			goto failure;
867 
868 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
869 			skb = alloc_skb(header_len, gfp_mask);
870 			if (skb) {
871 				int npages;
872 				int i;
873 
874 				/* No pages, we're done... */
875 				if (!data_len)
876 					break;
877 
878 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
879 				skb->truesize += data_len;
880 				skb_shinfo(skb)->nr_frags = npages;
881 				for (i = 0; i < npages; i++) {
882 					struct page *page;
883 					skb_frag_t *frag;
884 
885 					page = alloc_pages(sk->sk_allocation, 0);
886 					if (!page) {
887 						err = -ENOBUFS;
888 						skb_shinfo(skb)->nr_frags = i;
889 						kfree_skb(skb);
890 						goto failure;
891 					}
892 
893 					frag = &skb_shinfo(skb)->frags[i];
894 					frag->page = page;
895 					frag->page_offset = 0;
896 					frag->size = (data_len >= PAGE_SIZE ?
897 						      PAGE_SIZE :
898 						      data_len);
899 					data_len -= PAGE_SIZE;
900 				}
901 
902 				/* Full success... */
903 				break;
904 			}
905 			err = -ENOBUFS;
906 			goto failure;
907 		}
908 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
909 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
910 		err = -EAGAIN;
911 		if (!timeo)
912 			goto failure;
913 		if (signal_pending(current))
914 			goto interrupted;
915 		timeo = sock_wait_for_wmem(sk, timeo);
916 	}
917 
918 	skb_set_owner_w(skb, sk);
919 	return skb;
920 
921 interrupted:
922 	err = sock_intr_errno(timeo);
923 failure:
924 	*errcode = err;
925 	return NULL;
926 }
927 
928 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
929 				    int noblock, int *errcode)
930 {
931 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
932 }
933 
934 static void __lock_sock(struct sock *sk)
935 {
936 	DEFINE_WAIT(wait);
937 
938 	for(;;) {
939 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
940 					TASK_UNINTERRUPTIBLE);
941 		spin_unlock_bh(&sk->sk_lock.slock);
942 		schedule();
943 		spin_lock_bh(&sk->sk_lock.slock);
944 		if(!sock_owned_by_user(sk))
945 			break;
946 	}
947 	finish_wait(&sk->sk_lock.wq, &wait);
948 }
949 
950 static void __release_sock(struct sock *sk)
951 {
952 	struct sk_buff *skb = sk->sk_backlog.head;
953 
954 	do {
955 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
956 		bh_unlock_sock(sk);
957 
958 		do {
959 			struct sk_buff *next = skb->next;
960 
961 			skb->next = NULL;
962 			sk->sk_backlog_rcv(sk, skb);
963 
964 			/*
965 			 * We are in process context here with softirqs
966 			 * disabled, use cond_resched_softirq() to preempt.
967 			 * This is safe to do because we've taken the backlog
968 			 * queue private:
969 			 */
970 			cond_resched_softirq();
971 
972 			skb = next;
973 		} while (skb != NULL);
974 
975 		bh_lock_sock(sk);
976 	} while((skb = sk->sk_backlog.head) != NULL);
977 }
978 
979 /**
980  * sk_wait_data - wait for data to arrive at sk_receive_queue
981  * @sk:    sock to wait on
982  * @timeo: for how long
983  *
984  * Now socket state including sk->sk_err is changed only under the lock,
985  * hence we may omit checks after joining the wait queue.
986  * We check the receive queue before schedule() only as an optimization;
987  * it is very likely that release_sock() added new data.
988  */
989 int sk_wait_data(struct sock *sk, long *timeo)
990 {
991 	int rc;
992 	DEFINE_WAIT(wait);
993 
994 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
995 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
996 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
997 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
998 	finish_wait(sk->sk_sleep, &wait);
999 	return rc;
1000 }
1001 
1002 EXPORT_SYMBOL(sk_wait_data);
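/*
 * Typical use in a protocol's recvmsg path (sketch; error and shutdown
 * handling elided):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		sk_wait_data(sk, &timeo);
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *	}
 */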
1003 
1004 /*
1005  * Set of default routines for initialising struct proto_ops when
1006  * the protocol does not support a particular function. In certain
1007  * cases where it makes no sense for a protocol to have a "do nothing"
1008  * function, some default processing is provided.
1009  */
1010 
1011 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1012 {
1013 	return -EOPNOTSUPP;
1014 }
1015 
1016 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1017 		    int len, int flags)
1018 {
1019 	return -EOPNOTSUPP;
1020 }
1021 
1022 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1023 {
1024 	return -EOPNOTSUPP;
1025 }
1026 
1027 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1028 {
1029 	return -EOPNOTSUPP;
1030 }
1031 
1032 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1033 		    int *len, int peer)
1034 {
1035 	return -EOPNOTSUPP;
1036 }
1037 
1038 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1039 {
1040 	return 0;
1041 }
1042 
1043 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1044 {
1045 	return -EOPNOTSUPP;
1046 }
1047 
1048 int sock_no_listen(struct socket *sock, int backlog)
1049 {
1050 	return -EOPNOTSUPP;
1051 }
1052 
1053 int sock_no_shutdown(struct socket *sock, int how)
1054 {
1055 	return -EOPNOTSUPP;
1056 }
1057 
1058 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1059 		    char __user *optval, int optlen)
1060 {
1061 	return -EOPNOTSUPP;
1062 }
1063 
1064 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1065 		    char __user *optval, int __user *optlen)
1066 {
1067 	return -EOPNOTSUPP;
1068 }
1069 
1070 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1071 		    size_t len)
1072 {
1073 	return -EOPNOTSUPP;
1074 }
1075 
1076 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1077 		    size_t len, int flags)
1078 {
1079 	return -EOPNOTSUPP;
1080 }
1081 
1082 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1083 {
1084 	/* Mirror missing mmap method error code */
1085 	return -ENODEV;
1086 }
1087 
1088 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1089 {
1090 	ssize_t res;
1091 	struct msghdr msg = {.msg_flags = flags};
1092 	struct kvec iov;
1093 	char *kaddr = kmap(page);
1094 	iov.iov_base = kaddr + offset;
1095 	iov.iov_len = size;
1096 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1097 	kunmap(page);
1098 	return res;
1099 }
1100 
1101 /*
1102  *	Default Socket Callbacks
1103  */
1104 
1105 static void sock_def_wakeup(struct sock *sk)
1106 {
1107 	read_lock(&sk->sk_callback_lock);
1108 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1109 		wake_up_interruptible_all(sk->sk_sleep);
1110 	read_unlock(&sk->sk_callback_lock);
1111 }
1112 
1113 static void sock_def_error_report(struct sock *sk)
1114 {
1115 	read_lock(&sk->sk_callback_lock);
1116 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1117 		wake_up_interruptible(sk->sk_sleep);
1118 	sk_wake_async(sk,0,POLL_ERR);
1119 	read_unlock(&sk->sk_callback_lock);
1120 }
1121 
1122 static void sock_def_readable(struct sock *sk, int len)
1123 {
1124 	read_lock(&sk->sk_callback_lock);
1125 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1126 		wake_up_interruptible(sk->sk_sleep);
1127 	sk_wake_async(sk,1,POLL_IN);
1128 	read_unlock(&sk->sk_callback_lock);
1129 }
1130 
1131 static void sock_def_write_space(struct sock *sk)
1132 {
1133 	read_lock(&sk->sk_callback_lock);
1134 
1135 	/* Do not wake up a writer until he can make "significant"
1136 	 * progress.  --DaveM
1137 	 */
1138 	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1139 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1140 			wake_up_interruptible(sk->sk_sleep);
1141 
1142 		/* Should agree with poll, otherwise some programs break */
1143 		if (sock_writeable(sk))
1144 			sk_wake_async(sk, 2, POLL_OUT);
1145 	}
1146 
1147 	read_unlock(&sk->sk_callback_lock);
1148 }
1149 
1150 static void sock_def_destruct(struct sock *sk)
1151 {
1152 	if (sk->sk_protinfo)
1153 		kfree(sk->sk_protinfo);
1154 }
1155 
1156 void sk_send_sigurg(struct sock *sk)
1157 {
1158 	if (sk->sk_socket && sk->sk_socket->file)
1159 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1160 			sk_wake_async(sk, 3, POLL_PRI);
1161 }
1162 
1163 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1164 		    unsigned long expires)
1165 {
1166 	if (!mod_timer(timer, expires))
1167 		sock_hold(sk);
1168 }
1169 
1170 EXPORT_SYMBOL(sk_reset_timer);
1171 
1172 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1173 {
1174 	if (timer_pending(timer) && del_timer(timer))
1175 		__sock_put(sk);
1176 }
1177 
1178 EXPORT_SYMBOL(sk_stop_timer);
1179 
1180 void sock_init_data(struct socket *sock, struct sock *sk)
1181 {
1182 	skb_queue_head_init(&sk->sk_receive_queue);
1183 	skb_queue_head_init(&sk->sk_write_queue);
1184 	skb_queue_head_init(&sk->sk_error_queue);
1185 
1186 	sk->sk_send_head	=	NULL;
1187 
1188 	init_timer(&sk->sk_timer);
1189 
1190 	sk->sk_allocation	=	GFP_KERNEL;
1191 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1192 	sk->sk_sndbuf		=	sysctl_wmem_default;
1193 	sk->sk_state		=	TCP_CLOSE;
1194 	sk->sk_socket		=	sock;
1195 
1196 	sock_set_flag(sk, SOCK_ZAPPED);
1197 
1198 	if(sock)
1199 	{
1200 		sk->sk_type	=	sock->type;
1201 		sk->sk_sleep	=	&sock->wait;
1202 		sock->sk	=	sk;
1203 	} else
1204 		sk->sk_sleep	=	NULL;
1205 
1206 	rwlock_init(&sk->sk_dst_lock);
1207 	rwlock_init(&sk->sk_callback_lock);
1208 
1209 	sk->sk_state_change	=	sock_def_wakeup;
1210 	sk->sk_data_ready	=	sock_def_readable;
1211 	sk->sk_write_space	=	sock_def_write_space;
1212 	sk->sk_error_report	=	sock_def_error_report;
1213 	sk->sk_destruct		=	sock_def_destruct;
1214 
1215 	sk->sk_sndmsg_page	=	NULL;
1216 	sk->sk_sndmsg_off	=	0;
1217 
1218 	sk->sk_peercred.pid 	=	0;
1219 	sk->sk_peercred.uid	=	-1;
1220 	sk->sk_peercred.gid	=	-1;
1221 	sk->sk_write_pending	=	0;
1222 	sk->sk_rcvlowat		=	1;
1223 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1224 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1225 
1226 	sk->sk_stamp.tv_sec     = -1L;
1227 	sk->sk_stamp.tv_usec    = -1L;
1228 
1229 	atomic_set(&sk->sk_refcnt, 1);
1230 }
1231 
1232 void fastcall lock_sock(struct sock *sk)
1233 {
1234 	might_sleep();
1235 	spin_lock_bh(&(sk->sk_lock.slock));
1236 	if (sk->sk_lock.owner)
1237 		__lock_sock(sk);
1238 	sk->sk_lock.owner = (void *)1;
1239 	spin_unlock_bh(&(sk->sk_lock.slock));
1240 }
1241 
1242 EXPORT_SYMBOL(lock_sock);
1243 
1244 void fastcall release_sock(struct sock *sk)
1245 {
1246 	spin_lock_bh(&(sk->sk_lock.slock));
1247 	if (sk->sk_backlog.tail)
1248 		__release_sock(sk);
1249 	sk->sk_lock.owner = NULL;
1250 	if (waitqueue_active(&(sk->sk_lock.wq)))
1251 		wake_up(&(sk->sk_lock.wq));
1252 	spin_unlock_bh(&(sk->sk_lock.slock));
1253 }
1254 EXPORT_SYMBOL(release_sock);
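/*
 * The canonical pairing for process-context protocol code: while the
 * lock is held, softirq input is diverted to sk->sk_backlog, and
 * release_sock() replays it via __release_sock() (sketch):
 *
 *	lock_sock(sk);
 *	... examine or modify socket state ...
 *	release_sock(sk);
 */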
1255 
1256 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1257 {
1258 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1259 		sock_enable_timestamp(sk);
1260 	if (sk->sk_stamp.tv_sec == -1)
1261 		return -ENOENT;
1262 	if (sk->sk_stamp.tv_sec == 0)
1263 		do_gettimeofday(&sk->sk_stamp);
1264 	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
1265 		-EFAULT : 0;
1266 }
1267 EXPORT_SYMBOL(sock_get_timestamp);
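/*
 * This helper backs the SIOCGSTAMP ioctl: userspace can read the receive
 * timestamp of the most recent packet like so (illustrative):
 *
 *	struct timeval tv;
 *
 *	if (ioctl(fd, SIOCGSTAMP, &tv) == 0)
 *		printf("%ld.%06ld\n", tv.tv_sec, tv.tv_usec);
 */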
1268 
1269 void sock_enable_timestamp(struct sock *sk)
1270 {
1271 	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1272 		sock_set_flag(sk, SOCK_TIMESTAMP);
1273 		net_enable_timestamp();
1274 	}
1275 }
1276 EXPORT_SYMBOL(sock_enable_timestamp);
1277 
1278 /*
1279  *	Get a socket option on a socket.
1280  *
1281  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1282  *	asynchronous errors should be reported by getsockopt. We assume
1283  *	this means if you specify SO_ERROR (otherwise what's the point of it).
1284  */
1285 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1286 			   char __user *optval, int __user *optlen)
1287 {
1288 	struct sock *sk = sock->sk;
1289 
1290 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1291 }
1292 
1293 EXPORT_SYMBOL(sock_common_getsockopt);
1294 
1295 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1296 			struct msghdr *msg, size_t size, int flags)
1297 {
1298 	struct sock *sk = sock->sk;
1299 	int addr_len = 0;
1300 	int err;
1301 
1302 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1303 				   flags & ~MSG_DONTWAIT, &addr_len);
1304 	if (err >= 0)
1305 		msg->msg_namelen = addr_len;
1306 	return err;
1307 }
1308 
1309 EXPORT_SYMBOL(sock_common_recvmsg);
1310 
1311 /*
1312  *	Set socket options on an inet socket.
1313  */
1314 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1315 			   char __user *optval, int optlen)
1316 {
1317 	struct sock *sk = sock->sk;
1318 
1319 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1320 }
1321 
1322 EXPORT_SYMBOL(sock_common_setsockopt);
1323 
1324 void sk_common_release(struct sock *sk)
1325 {
1326 	if (sk->sk_prot->destroy)
1327 		sk->sk_prot->destroy(sk);
1328 
1329 	/*
1330 	 * Observation: when sk_common_release is called, processes have
1331 	 * no access to the socket, but the network stack still does.
1332 	 * Step one, detach it from networking:
1333 	 *
1334 	 * A. Remove from hash tables.
1335 	 */
1336 
1337 	sk->sk_prot->unhash(sk);
1338 
1339 	/*
1340 	 * At this point the socket cannot receive new packets, but some may
1341 	 * still be in flight, because some CPU is running the receiver and
1342 	 * did its hash table lookup before we unhashed the socket. They will
1343 	 * reach the receive queue and be purged by the socket destructor.
1344 	 *
1345 	 * Also, we still have packets pending on the receive queue and,
1346 	 * probably, our own packets waiting in device queues. sock_destroy
1347 	 * will drain the receive queue, but transmitted packets delay socket
1348 	 * destruction until the last reference is released.
1349 	 */
1350 
1351 	sock_orphan(sk);
1352 
1353 	xfrm_sk_free_policy(sk);
1354 
1355 #ifdef INET_REFCNT_DEBUG
1356 	if (atomic_read(&sk->sk_refcnt) != 1)
1357 		printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
1358 		       sk, atomic_read(&sk->sk_refcnt));
1359 #endif
1360 	sock_put(sk);
1361 }
1362 
1363 EXPORT_SYMBOL(sk_common_release);
1364 
1365 static DEFINE_RWLOCK(proto_list_lock);
1366 static LIST_HEAD(proto_list);
1367 
1368 int proto_register(struct proto *prot, int alloc_slab)
1369 {
1370 	char *request_sock_slab_name;
1371 	int rc = -ENOBUFS;
1372 
1373 	if (alloc_slab) {
1374 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
1375 					       SLAB_HWCACHE_ALIGN, NULL, NULL);
1376 
1377 		if (prot->slab == NULL) {
1378 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
1379 			       prot->name);
1380 			goto out;
1381 		}
1382 
1383 		if (prot->rsk_prot != NULL) {
1384 			static const char mask[] = "request_sock_%s";
1385 
1386 			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
1387 			if (request_sock_slab_name == NULL)
1388 				goto out_free_sock_slab;
1389 
1390 			sprintf(request_sock_slab_name, mask, prot->name);
1391 			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
1392 								 prot->rsk_prot->obj_size, 0,
1393 								 SLAB_HWCACHE_ALIGN, NULL, NULL);
1394 
1395 			if (prot->rsk_prot->slab == NULL) {
1396 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
1397 				       prot->name);
1398 				goto out_free_request_sock_slab_name;
1399 			}
1400 		}
1401 	}
1402 
1403 	write_lock(&proto_list_lock);
1404 	list_add(&prot->node, &proto_list);
1405 	write_unlock(&proto_list_lock);
1406 	rc = 0;
1407 out:
1408 	return rc;
1409 out_free_request_sock_slab_name:
1410 	kfree(request_sock_slab_name);
1411 out_free_sock_slab:
1412 	kmem_cache_destroy(prot->slab);
1413 	prot->slab = NULL;
1414 	goto out;
1415 }
1416 
1417 EXPORT_SYMBOL(proto_register);
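/*
 * A protocol registers its struct proto once at module init, e.g.
 * (sketch; the names are hypothetical):
 *
 *	static struct proto my_proto = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_proto, 1);	  1 => allocate a slab cache
 *
 * and undoes it with proto_unregister(&my_proto) on module exit.
 */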
1418 
1419 void proto_unregister(struct proto *prot)
1420 {
1421 	write_lock(&proto_list_lock);
1422 
1423 	if (prot->slab != NULL) {
1424 		kmem_cache_destroy(prot->slab);
1425 		prot->slab = NULL;
1426 	}
1427 
1428 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
1429 		const char *name = kmem_cache_name(prot->rsk_prot->slab);
1430 
1431 		kmem_cache_destroy(prot->rsk_prot->slab);
1432 		kfree(name);
1433 		prot->rsk_prot->slab = NULL;
1434 	}
1435 
1436 	list_del(&prot->node);
1437 	write_unlock(&proto_list_lock);
1438 }
1439 
1440 EXPORT_SYMBOL(proto_unregister);
1441 
1442 #ifdef CONFIG_PROC_FS
1443 static inline struct proto *__proto_head(void)
1444 {
1445 	return list_entry(proto_list.next, struct proto, node);
1446 }
1447 
1448 static inline struct proto *proto_head(void)
1449 {
1450 	return list_empty(&proto_list) ? NULL : __proto_head();
1451 }
1452 
1453 static inline struct proto *proto_next(struct proto *proto)
1454 {
1455 	return proto->node.next == &proto_list ? NULL :
1456 		list_entry(proto->node.next, struct proto, node);
1457 }
1458 
1459 static inline struct proto *proto_get_idx(loff_t pos)
1460 {
1461 	struct proto *proto;
1462 	loff_t i = 0;
1463 
1464 	list_for_each_entry(proto, &proto_list, node)
1465 		if (i++ == pos)
1466 			goto out;
1467 
1468 	proto = NULL;
1469 out:
1470 	return proto;
1471 }
1472 
1473 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
1474 {
1475 	read_lock(&proto_list_lock);
1476 	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
1477 }
1478 
1479 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1480 {
1481 	++*pos;
1482 	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
1483 }
1484 
1485 static void proto_seq_stop(struct seq_file *seq, void *v)
1486 {
1487 	read_unlock(&proto_list_lock);
1488 }
1489 
1490 static char proto_method_implemented(const void *method)
1491 {
1492 	return method == NULL ? 'n' : 'y';
1493 }
1494 
1495 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
1496 {
1497 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
1498 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
1499 		   proto->name,
1500 		   proto->obj_size,
1501 		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
1502 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
1503 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
1504 		   proto->max_header,
1505 		   proto->slab == NULL ? "no" : "yes",
1506 		   module_name(proto->owner),
1507 		   proto_method_implemented(proto->close),
1508 		   proto_method_implemented(proto->connect),
1509 		   proto_method_implemented(proto->disconnect),
1510 		   proto_method_implemented(proto->accept),
1511 		   proto_method_implemented(proto->ioctl),
1512 		   proto_method_implemented(proto->init),
1513 		   proto_method_implemented(proto->destroy),
1514 		   proto_method_implemented(proto->shutdown),
1515 		   proto_method_implemented(proto->setsockopt),
1516 		   proto_method_implemented(proto->getsockopt),
1517 		   proto_method_implemented(proto->sendmsg),
1518 		   proto_method_implemented(proto->recvmsg),
1519 		   proto_method_implemented(proto->sendpage),
1520 		   proto_method_implemented(proto->bind),
1521 		   proto_method_implemented(proto->backlog_rcv),
1522 		   proto_method_implemented(proto->hash),
1523 		   proto_method_implemented(proto->unhash),
1524 		   proto_method_implemented(proto->get_port),
1525 		   proto_method_implemented(proto->enter_memory_pressure));
1526 }
1527 
1528 static int proto_seq_show(struct seq_file *seq, void *v)
1529 {
1530 	if (v == SEQ_START_TOKEN)
1531 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
1532 			   "protocol",
1533 			   "size",
1534 			   "sockets",
1535 			   "memory",
1536 			   "press",
1537 			   "maxhdr",
1538 			   "slab",
1539 			   "module",
1540 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
1541 	else
1542 		proto_seq_printf(seq, v);
1543 	return 0;
1544 }
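/*
 * A line of /proc/net/protocols produced above might look like this
 * (illustrative values only):
 *
 *	TCP       1124  3      1   no   224   yes  kernel  y y y y y y y y y y y y y n y y y y y
 */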
1545 
1546 static struct seq_operations proto_seq_ops = {
1547 	.start  = proto_seq_start,
1548 	.next   = proto_seq_next,
1549 	.stop   = proto_seq_stop,
1550 	.show   = proto_seq_show,
1551 };
1552 
1553 static int proto_seq_open(struct inode *inode, struct file *file)
1554 {
1555 	return seq_open(file, &proto_seq_ops);
1556 }
1557 
1558 static struct file_operations proto_seq_fops = {
1559 	.owner		= THIS_MODULE,
1560 	.open		= proto_seq_open,
1561 	.read		= seq_read,
1562 	.llseek		= seq_lseek,
1563 	.release	= seq_release,
1564 };
1565 
1566 static int __init proto_init(void)
1567 {
1568 	/* register /proc/net/protocols */
1569 	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
1570 }
1571 
1572 subsys_initcall(proto_init);
1573 
1574 #endif /* CONFIG_PROC_FS */
1575 
1576 EXPORT_SYMBOL(sk_alloc);
1577 EXPORT_SYMBOL(sk_free);
1578 EXPORT_SYMBOL(sk_send_sigurg);
1579 EXPORT_SYMBOL(sock_alloc_send_skb);
1580 EXPORT_SYMBOL(sock_init_data);
1581 EXPORT_SYMBOL(sock_kfree_s);
1582 EXPORT_SYMBOL(sock_kmalloc);
1583 EXPORT_SYMBOL(sock_no_accept);
1584 EXPORT_SYMBOL(sock_no_bind);
1585 EXPORT_SYMBOL(sock_no_connect);
1586 EXPORT_SYMBOL(sock_no_getname);
1587 EXPORT_SYMBOL(sock_no_getsockopt);
1588 EXPORT_SYMBOL(sock_no_ioctl);
1589 EXPORT_SYMBOL(sock_no_listen);
1590 EXPORT_SYMBOL(sock_no_mmap);
1591 EXPORT_SYMBOL(sock_no_poll);
1592 EXPORT_SYMBOL(sock_no_recvmsg);
1593 EXPORT_SYMBOL(sock_no_sendmsg);
1594 EXPORT_SYMBOL(sock_no_sendpage);
1595 EXPORT_SYMBOL(sock_no_setsockopt);
1596 EXPORT_SYMBOL(sock_no_shutdown);
1597 EXPORT_SYMBOL(sock_no_socketpair);
1598 EXPORT_SYMBOL(sock_rfree);
1599 EXPORT_SYMBOL(sock_setsockopt);
1600 EXPORT_SYMBOL(sock_wfree);
1601 EXPORT_SYMBOL(sock_wmalloc);
1602 EXPORT_SYMBOL(sock_i_uid);
1603 EXPORT_SYMBOL(sock_i_ino);
1604 #ifdef CONFIG_SYSCTL
1605 EXPORT_SYMBOL(sysctl_optmem_max);
1606 EXPORT_SYMBOL(sysctl_rmem_max);
1607 EXPORT_SYMBOL(sysctl_wmem_max);
1608 #endif
1609