/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handlers for protocols to use, and a generic option handler.
 *
 *
 * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/config.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>

#include <asm/uaccess.h>
#include <asm/system.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>

#include <linux/filter.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

/* Take the size of the struct sk_buff overhead into account when
 * determining these values, since that size is not constant across
 * platforms.  This keeps socket queueing behavior and performance
 * from depending upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
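
/*
 * Illustrative arithmetic (not from the original source): if
 * sizeof(struct sk_buff) were 192 bytes on a given platform, then
 * _SK_MEM_OVERHEAD = 192 + 256 = 448 and SK_WMEM_MAX = SK_RMEM_MAX =
 * 448 * 256 = 114688 bytes (~112 KiB).  A platform with a larger
 * sk_buff gets proportionally larger defaults, which is the point of
 * folding the overhead into the calculation.
 */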

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max = SK_WMEM_MAX;
__u32 sysctl_rmem_max = SK_RMEM_MAX;
__u32 sysctl_wmem_default = SK_WMEM_MAX;
__u32 sysctl_rmem_default = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
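
/*
 * Worked example (assuming UIO_MAXIOV == 1024 and an 8-byte unsigned
 * long, as on 64-bit platforms): sysctl_optmem_max = 8 * (2 * 1024 +
 * 512) = 20480 bytes.  On 32-bit platforms with 4-byte longs the same
 * formula yields 10240 bytes.
 */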

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;

	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
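
/*
 * Example of the conversion above (illustrative, assuming HZ == 1000):
 * a struct timeval of { .tv_sec = 1, .tv_usec = 500 } becomes
 * 1 * 1000 + (500 + 999) / 1000 = 1001 jiffies, i.e. sub-tick
 * microsecond values are rounded up to a whole jiffy.  A zero timeval
 * selects MAX_SCHEDULE_TIMEOUT, meaning "wait forever".
 */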

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm,  current->comm);
		printk(KERN_WARNING "process `%s' is using obsolete "
		       "%s SO_BSDCOMPAT\n", warncomm, name);
		warned++;
	}
}

static void sock_disable_timestamp(struct sock *sk)
{
	if (sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_reset_flag(sk, SOCK_TIMESTAMP);
		net_disable_timestamp();
	}
}


/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	struct sock *sk=sock->sk;
	struct sk_filter *filter;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

#ifdef SO_DONTLINGER		/* Compatibility item... */
	if (optname == SO_DONTLINGER) {
		lock_sock(sk);
		sock_reset_flag(sk, SOCK_LINGER);
		release_sock(sk);
		return 0;
	}
#endif

	if(optlen<sizeof(int))
		return(-EINVAL);

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val?1:0;

	lock_sock(sk);

	switch(optname)
	{
		case SO_DEBUG:
			if(val && !capable(CAP_NET_ADMIN))
			{
				ret = -EACCES;
			}
			else if (valbool)
				sock_set_flag(sk, SOCK_DBG);
			else
				sock_reset_flag(sk, SOCK_DBG);
			break;
		case SO_REUSEADDR:
			sk->sk_reuse = valbool;
			break;
		case SO_TYPE:
		case SO_ERROR:
			ret = -ENOPROTOOPT;
			break;
		case SO_DONTROUTE:
			if (valbool)
				sock_set_flag(sk, SOCK_LOCALROUTE);
			else
				sock_reset_flag(sk, SOCK_LOCALROUTE);
			break;
		case SO_BROADCAST:
			sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
			break;
		case SO_SNDBUF:
			/* Don't error on this; BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_wmem_max)
				val = sysctl_wmem_max;
set_sndbuf:
			sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
			if ((val * 2) < SOCK_MIN_SNDBUF)
				sk->sk_sndbuf = SOCK_MIN_SNDBUF;
			else
				sk->sk_sndbuf = val * 2;

			/*
			 *	Wake up sending tasks if we
			 *	upped the value.
			 */
			sk->sk_write_space(sk);
			break;

		case SO_SNDBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_sndbuf;

		case SO_RCVBUF:
			/* Don't error on this; BSD doesn't, and if you think
			   about it this is right. Otherwise apps have to
			   play 'guess the biggest size' games. RCVBUF/SNDBUF
			   are treated in BSD as hints */

			if (val > sysctl_rmem_max)
				val = sysctl_rmem_max;
set_rcvbuf:
			sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
			/* FIXME: is this lower bound the right one? */
			if ((val * 2) < SOCK_MIN_RCVBUF)
				sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
			else
				sk->sk_rcvbuf = val * 2;
			break;

		case SO_RCVBUFFORCE:
			if (!capable(CAP_NET_ADMIN)) {
				ret = -EPERM;
				break;
			}
			goto set_rcvbuf;

		case SO_KEEPALIVE:
#ifdef CONFIG_INET
			if (sk->sk_protocol == IPPROTO_TCP)
				tcp_set_keepalive(sk, valbool);
#endif
			sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
			break;

		case SO_OOBINLINE:
			sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
			break;

		case SO_NO_CHECK:
			sk->sk_no_check = valbool;
			break;

		case SO_PRIORITY:
			if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
				sk->sk_priority = val;
			else
				ret = -EPERM;
			break;

		case SO_LINGER:
			if(optlen<sizeof(ling)) {
				ret = -EINVAL;	/* 1003.1g */
				break;
			}
			if (copy_from_user(&ling,optval,sizeof(ling))) {
				ret = -EFAULT;
				break;
			}
			if (!ling.l_onoff)
				sock_reset_flag(sk, SOCK_LINGER);
			else {
#if (BITS_PER_LONG == 32)
				if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
					sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
				else
#endif
					sk->sk_lingertime = ling.l_linger * HZ;
				sock_set_flag(sk, SOCK_LINGER);
			}
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("setsockopt");
			break;

		case SO_PASSCRED:
			if (valbool)
				set_bit(SOCK_PASSCRED, &sock->flags);
			else
				clear_bit(SOCK_PASSCRED, &sock->flags);
			break;

		case SO_TIMESTAMP:
			if (valbool)  {
				sock_set_flag(sk, SOCK_RCVTSTAMP);
				sock_enable_timestamp(sk);
			} else
				sock_reset_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVLOWAT:
			if (val < 0)
				val = INT_MAX;
			sk->sk_rcvlowat = val ? : 1;
			break;

		case SO_RCVTIMEO:
			ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
			break;

		case SO_SNDTIMEO:
			ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
			break;

#ifdef CONFIG_NETDEVICES
		case SO_BINDTODEVICE:
		{
			char devname[IFNAMSIZ];

			/* Sorry... */
			if (!capable(CAP_NET_RAW)) {
				ret = -EPERM;
				break;
			}

			/* Bind this socket to a particular device like "eth0",
			 * as specified in the passed interface name. If the
			 * name is "" or the option length is zero the socket
			 * is not bound.
			 */

			if (!valbool) {
				sk->sk_bound_dev_if = 0;
			} else {
				if (optlen > IFNAMSIZ)
					optlen = IFNAMSIZ;
				if (copy_from_user(devname, optval, optlen)) {
					ret = -EFAULT;
					break;
				}

				/* Remove any cached route for this socket. */
				sk_dst_reset(sk);

				if (devname[0] == '\0') {
					sk->sk_bound_dev_if = 0;
				} else {
					struct net_device *dev = dev_get_by_name(devname);
					if (!dev) {
						ret = -ENODEV;
						break;
					}
					sk->sk_bound_dev_if = dev->ifindex;
					dev_put(dev);
				}
			}
			break;
		}
#endif


		case SO_ATTACH_FILTER:
			ret = -EINVAL;
			if (optlen == sizeof(struct sock_fprog)) {
				struct sock_fprog fprog;

				ret = -EFAULT;
				if (copy_from_user(&fprog, optval, sizeof(fprog)))
					break;

				ret = sk_attach_filter(&fprog, sk);
			}
			break;

		case SO_DETACH_FILTER:
			spin_lock_bh(&sk->sk_lock.slock);
			filter = sk->sk_filter;
			if (filter) {
				sk->sk_filter = NULL;
				spin_unlock_bh(&sk->sk_lock.slock);
				sk_filter_release(sk, filter);
				break;
			}
			spin_unlock_bh(&sk->sk_lock.slock);
			ret = -ENONET;
			break;

		/* We implement SO_SNDLOWAT etc. as not settable
		   (1003.1g 5.3) */
		default:
			ret = -ENOPROTOOPT;
			break;
	}
	release_sock(sk);
	return ret;
}
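
/*
 * Usage sketch (hypothetical caller, not from this file): a protocol's
 * setsockopt method typically forwards SOL_SOCKET options here and
 * handles only its own levels itself, e.g.:
 *
 *	if (level == SOL_SOCKET)
 *		return sock_setsockopt(sock, level, optname, optval, optlen);
 *	return my_proto_setsockopt(sk, level, optname, optval, optlen);
 *
 * where my_proto_setsockopt() stands in for the protocol-private
 * handler.
 */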


int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union
	{
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	unsigned int lv = sizeof(int);
	int len;

	if(get_user(len,optlen))
		return -EFAULT;
	if(len < 0)
		return -EINVAL;

	switch(optname)
	{
		case SO_DEBUG:
			v.val = sock_flag(sk, SOCK_DBG);
			break;

		case SO_DONTROUTE:
			v.val = sock_flag(sk, SOCK_LOCALROUTE);
			break;

		case SO_BROADCAST:
			v.val = !!sock_flag(sk, SOCK_BROADCAST);
			break;

		case SO_SNDBUF:
			v.val = sk->sk_sndbuf;
			break;

		case SO_RCVBUF:
			v.val = sk->sk_rcvbuf;
			break;

		case SO_REUSEADDR:
			v.val = sk->sk_reuse;
			break;

		case SO_KEEPALIVE:
			v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
			break;

		case SO_TYPE:
			v.val = sk->sk_type;
			break;

		case SO_ERROR:
			v.val = -sock_error(sk);
			if(v.val==0)
				v.val = xchg(&sk->sk_err_soft, 0);
			break;

		case SO_OOBINLINE:
			v.val = !!sock_flag(sk, SOCK_URGINLINE);
			break;

		case SO_NO_CHECK:
			v.val = sk->sk_no_check;
			break;

		case SO_PRIORITY:
			v.val = sk->sk_priority;
			break;

		case SO_LINGER:
			lv		= sizeof(v.ling);
			v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
			v.ling.l_linger	= sk->sk_lingertime / HZ;
			break;

		case SO_BSDCOMPAT:
			sock_warn_obsolete_bsdism("getsockopt");
			break;

		case SO_TIMESTAMP:
			v.val = sock_flag(sk, SOCK_RCVTSTAMP);
			break;

		case SO_RCVTIMEO:
			lv=sizeof(struct timeval);
			if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_SNDTIMEO:
			lv=sizeof(struct timeval);
			if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
				v.tm.tv_sec = 0;
				v.tm.tv_usec = 0;
			} else {
				v.tm.tv_sec = sk->sk_sndtimeo / HZ;
				v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
			}
			break;

		case SO_RCVLOWAT:
			v.val = sk->sk_rcvlowat;
			break;

		case SO_SNDLOWAT:
			v.val=1;
			break;

		case SO_PASSCRED:
			v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
			break;

		case SO_PEERCRED:
			if (len > sizeof(sk->sk_peercred))
				len = sizeof(sk->sk_peercred);
			if (copy_to_user(optval, &sk->sk_peercred, len))
				return -EFAULT;
			goto lenout;

		case SO_PEERNAME:
		{
			char address[128];

			if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
				return -ENOTCONN;
			if (lv < len)
				return -EINVAL;
			if (copy_to_user(optval, address, len))
				return -EFAULT;
			goto lenout;
		}

		/* Dubious BSD thing... Probably nobody even uses it, but
		 * the UNIX standard wants it for whatever reason... -DaveM
		 */
		case SO_ACCEPTCONN:
			v.val = sk->sk_state == TCP_LISTEN;
			break;

		case SO_PEERSEC:
			return security_socket_getpeersec(sock, optval, optlen, len);

		default:
			return(-ENOPROTOOPT);
	}
	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}

/**
 *	sk_alloc - All socket objects are allocated here
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 *	@zero_it: if we should zero the newly allocated sock
 */
struct sock *sk_alloc(int family, unsigned int __nocast priority,
		      struct proto *prot, int zero_it)
{
	struct sock *sk = NULL;
	kmem_cache_t *slab = prot->slab;

	if (slab != NULL)
		sk = kmem_cache_alloc(slab, priority);
	else
		sk = kmalloc(prot->obj_size, priority);

	if (sk) {
		if (zero_it) {
			memset(sk, 0, prot->obj_size);
			sk->sk_family = family;
			/*
			 * See comment in struct sock definition to understand
			 * why we need sk_prot_creator -acme
			 */
			sk->sk_prot = sk->sk_prot_creator = prot;
			sock_lock_init(sk);
		}

		if (security_sk_alloc(sk, family, priority)) {
			if (slab != NULL)
				kmem_cache_free(slab, sk);
			else
				kfree(sk);
			sk = NULL;
		} else
			__module_get(prot->owner);
	}
	return sk;
}
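
/*
 * Usage sketch (hypothetical, mirroring how address families call this
 * allocator): a family's create routine allocates a zeroed sock bound
 * to its struct proto and then fills in family-specific state, e.g.:
 *
 *	struct sock *sk = sk_alloc(PF_INET, GFP_KERNEL, &my_prot, 1);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * where my_prot stands in for a registered struct proto such as the
 * one passed to proto_register() below.
 */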

void sk_free(struct sock *sk)
{
	struct sk_filter *filter;
	struct module *owner = sk->sk_prot_creator->owner;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);

	filter = sk->sk_filter;
	if (filter) {
		sk_filter_release(sk, filter);
		sk->sk_filter = NULL;
	}

	sock_disable_timestamp(sk);

	if (atomic_read(&sk->sk_omem_alloc))
		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));

	security_sk_free(sk);
	if (sk->sk_prot_creator->slab != NULL)
		kmem_cache_free(sk->sk_prot_creator->slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

struct sock *sk_clone(const struct sock *sk, const unsigned int __nocast priority)
{
	struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);

	if (newsk != NULL) {
		struct sk_filter *filter;

		memcpy(newsk, sk, sk->sk_prot->obj_size);

		/* SANITY */
		sk_node_init(&newsk->sk_node);
		sock_lock_init(newsk);
		bh_lock_sock(newsk);

		atomic_set(&newsk->sk_rmem_alloc, 0);
		atomic_set(&newsk->sk_wmem_alloc, 0);
		atomic_set(&newsk->sk_omem_alloc, 0);
		skb_queue_head_init(&newsk->sk_receive_queue);
		skb_queue_head_init(&newsk->sk_write_queue);

		rwlock_init(&newsk->sk_dst_lock);
		rwlock_init(&newsk->sk_callback_lock);

		newsk->sk_dst_cache	= NULL;
		newsk->sk_wmem_queued	= 0;
		newsk->sk_forward_alloc = 0;
		newsk->sk_send_head	= NULL;
		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;

		sock_reset_flag(newsk, SOCK_DONE);
		skb_queue_head_init(&newsk->sk_error_queue);

		filter = newsk->sk_filter;
		if (filter != NULL)
			sk_filter_charge(newsk, filter);

		if (unlikely(xfrm_sk_clone_policy(newsk))) {
			/* It is still a raw copy of the parent, so invalidate
			 * the destructor and do a plain sk_free() */
			newsk->sk_destruct = NULL;
			sk_free(newsk);
			newsk = NULL;
			goto out;
		}

		newsk->sk_err	   = 0;
		newsk->sk_priority = 0;
		atomic_set(&newsk->sk_refcnt, 2);

		/*
		 * Increment the counter in the same struct proto as the master
		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
		 * is the same as sk->sk_prot->socks, as this field was copied
		 * with memcpy).
		 *
		 * This _changes_ the previous behaviour, where
		 * tcp_create_openreq_child always was incrementing the
		 * equivalent to tcp_prot->socks (inet_sock_nr), so this has
		 * to be taken into account in all callers. -acme
		 */
		sk_refcnt_debug_inc(newsk);
		newsk->sk_socket = NULL;
		newsk->sk_sleep	 = NULL;

		if (newsk->sk_prot->sockets_allocated)
			atomic_inc(newsk->sk_prot->sockets_allocated);
	}
out:
	return newsk;
}

EXPORT_SYMBOL_GPL(sk_clone);

void __init sk_init(void)
{
	if (num_physpages <= 4096) {
		sysctl_wmem_max = 32767;
		sysctl_rmem_max = 32767;
		sysctl_wmem_default = 32767;
		sysctl_rmem_default = 32767;
	} else if (num_physpages >= 131072) {
		sysctl_wmem_max = 131071;
		sysctl_rmem_max = 131071;
	}
}

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* In case it might be waiting for more memory. */
	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
		sk->sk_write_space(sk);
	sock_put(sk);
}

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
}


int sock_i_uid(struct sock *sk)
{
	int uid;

	read_lock(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
	read_unlock(&sk->sk_callback_lock);
	return uid;
}

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock(&sk->sk_callback_lock);
	return ino;
}

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     unsigned int __nocast priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff * skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a skb from the socket's receive buffer.
 */
struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
			     unsigned int __nocast priority)
{
	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_r(skb, sk);
			return skb;
		}
	}
	return NULL;
}

/*
 * Allocate a memory block from the socket's option memory buffer.
 */
void *sock_kmalloc(struct sock *sk, int size, unsigned int __nocast priority)
{
	if ((unsigned)size <= sysctl_optmem_max &&
	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
		void *mem;
		/* First do the add, to avoid the race if kmalloc
		 * might sleep.
		 */
		atomic_add(size, &sk->sk_omem_alloc);
		mem = kmalloc(size, priority);
		if (mem)
			return mem;
		atomic_sub(size, &sk->sk_omem_alloc);
	}
	return NULL;
}

/*
 * Free an option memory block.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
	kfree(mem);
	atomic_sub(size, &sk->sk_omem_alloc);
}
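
/*
 * Usage sketch (hypothetical): sock_kmalloc()/sock_kfree_s() must be
 * used as a matched pair with the same size, since the size is what is
 * subtracted back from sk_omem_alloc:
 *
 *	void *opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */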

/* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
   I think these locks should be removed for datagram sockets.
 */
static long sock_wait_for_wmem(struct sock * sk, long timeo)
{
	DEFINE_WAIT(wait);

	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
			break;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			break;
		if (sk->sk_err)
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk->sk_sleep, &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
					    unsigned long header_len,
					    unsigned long data_len,
					    int noblock, int *errcode)
{
	struct sk_buff *skb;
	unsigned int gfp_mask;
	long timeo;
	int err;

	gfp_mask = sk->sk_allocation;
	if (gfp_mask & __GFP_WAIT)
		gfp_mask |= __GFP_REPEAT;

	timeo = sock_sndtimeo(sk, noblock);
	while (1) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (sk->sk_shutdown & SEND_SHUTDOWN)
			goto failure;

		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
			skb = alloc_skb(header_len, sk->sk_allocation);
			if (skb) {
				int npages;
				int i;

				/* No pages, we're done... */
				if (!data_len)
					break;

				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
				skb->truesize += data_len;
				skb_shinfo(skb)->nr_frags = npages;
				for (i = 0; i < npages; i++) {
					struct page *page;
					skb_frag_t *frag;

					page = alloc_pages(sk->sk_allocation, 0);
					if (!page) {
						err = -ENOBUFS;
						skb_shinfo(skb)->nr_frags = i;
						kfree_skb(skb);
						goto failure;
					}

					frag = &skb_shinfo(skb)->frags[i];
					frag->page = page;
					frag->page_offset = 0;
					frag->size = (data_len >= PAGE_SIZE ?
						      PAGE_SIZE :
						      data_len);
					data_len -= PAGE_SIZE;
				}

				/* Full success... */
				break;
			}
			err = -ENOBUFS;
			goto failure;
		}
		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}

	skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}

struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}

static void __lock_sock(struct sock *sk)
{
	DEFINE_WAIT(wait);

	for(;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if(!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

static void __release_sock(struct sock *sk)
{
	struct sk_buff *skb = sk->sk_backlog.head;

	do {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
		bh_unlock_sock(sk);

		do {
			struct sk_buff *next = skb->next;

			skb->next = NULL;
			sk->sk_backlog_rcv(sk, skb);

			/*
			 * We are in process context here with softirqs
			 * disabled, use cond_resched_softirq() to preempt.
			 * This is safe to do because we've taken the backlog
			 * queue private:
			 */
			cond_resched_softirq();

			skb = next;
		} while (skb != NULL);

		bh_lock_sock(sk);
	} while((skb = sk->sk_backlog.head) != NULL);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 *
 * Now socket state including sk->sk_err is changed only under the lock,
 * hence we may omit checks after joining the wait queue.
 * We check the receive queue before schedule() only as an optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo)
{
	int rc;
	DEFINE_WAIT(wait);

	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
	finish_wait(sk->sk_sleep, &wait);
	return rc;
}

EXPORT_SYMBOL(sk_wait_data);
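
/*
 * Usage sketch (hypothetical receive path, not from this file): a
 * datagram-style recvmsg implementation typically loops on the receive
 * queue under the socket lock and parks in sk_wait_data() while it is
 * empty:
 *
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo);
 *	}
 */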

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}

unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
{
	return 0;
}

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}

int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len)
{
	return -EOPNOTSUPP;
}

int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);
	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
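
/*
 * Usage sketch (hypothetical): a protocol that supports only a subset
 * of operations plugs these stubs into its struct proto_ops so that
 * every slot is populated, e.g.:
 *
 *	static struct proto_ops my_dgram_ops = {
 *		.family		= PF_MYPROTO,
 *		.bind		= my_bind,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= my_recvmsg,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 *
 * PF_MYPROTO and the my_* handlers are placeholders, not real symbols.
 */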

/*
 *	Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible_all(sk->sk_sleep);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_error_report(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk,0,POLL_ERR);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_readable(struct sock *sk, int len)
{
	read_lock(&sk->sk_callback_lock);
	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
		wake_up_interruptible(sk->sk_sleep);
	sk_wake_async(sk,1,POLL_IN);
	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_write_space(struct sock *sk)
{
	read_lock(&sk->sk_callback_lock);

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
			wake_up_interruptible(sk->sk_sleep);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, 2, POLL_OUT);
	}

	read_unlock(&sk->sk_callback_lock);
}

static void sock_def_destruct(struct sock *sk)
{
	if (sk->sk_protinfo)
		kfree(sk->sk_protinfo);
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, 3, POLL_PRI);
}

void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (timer_pending(timer) && del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head	=	NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation	=	GFP_KERNEL;
	sk->sk_rcvbuf		=	sysctl_rmem_default;
	sk->sk_sndbuf		=	sysctl_wmem_default;
	sk->sk_state		=	TCP_CLOSE;
	sk->sk_socket		=	sock;

	sock_set_flag(sk, SOCK_ZAPPED);

	if(sock)
	{
		sk->sk_type	=	sock->type;
		sk->sk_sleep	=	&sock->wait;
		sock->sk	=	sk;
	} else
		sk->sk_sleep	=	NULL;

	rwlock_init(&sk->sk_dst_lock);
	rwlock_init(&sk->sk_callback_lock);

	sk->sk_state_change	=	sock_def_wakeup;
	sk->sk_data_ready	=	sock_def_readable;
	sk->sk_write_space	=	sock_def_write_space;
	sk->sk_error_report	=	sock_def_error_report;
	sk->sk_destruct		=	sock_def_destruct;

	sk->sk_sndmsg_page	=	NULL;
	sk->sk_sndmsg_off	=	0;

	sk->sk_peercred.pid	=	0;
	sk->sk_peercred.uid	=	-1;
	sk->sk_peercred.gid	=	-1;
	sk->sk_write_pending	=	0;
	sk->sk_rcvlowat		=	1;
	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp.tv_sec	= -1L;
	sk->sk_stamp.tv_usec	= -1L;

	atomic_set(&sk->sk_refcnt, 1);
}

void fastcall lock_sock(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&(sk->sk_lock.slock));
	if (sk->sk_lock.owner)
		__lock_sock(sk);
	sk->sk_lock.owner = (void *)1;
	spin_unlock_bh(&(sk->sk_lock.slock));
}

EXPORT_SYMBOL(lock_sock);

void fastcall release_sock(struct sock *sk)
{
	spin_lock_bh(&(sk->sk_lock.slock));
	if (sk->sk_backlog.tail)
		__release_sock(sk);
	sk->sk_lock.owner = NULL;
	if (waitqueue_active(&(sk->sk_lock.wq)))
		wake_up(&(sk->sk_lock.wq));
	spin_unlock_bh(&(sk->sk_lock.slock));
}
EXPORT_SYMBOL(release_sock);
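
/*
 * Usage sketch (hypothetical): process-context code brackets socket
 * state changes with this pair; packets queued to the backlog while
 * the lock was held are processed by __release_sock() on the way out:
 *
 *	lock_sock(sk);
 *	... modify socket state ...
 *	release_sock(sk);
 */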

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk);
	if (sk->sk_stamp.tv_sec == -1)
		return -ENOENT;
	if (sk->sk_stamp.tv_sec == 0)
		do_gettimeofday(&sk->sk_stamp);
	return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
		-EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

void sock_enable_timestamp(struct sock *sk)
{
	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
		sock_set_flag(sk, SOCK_TIMESTAMP);
		net_enable_timestamp();
	}
}
EXPORT_SYMBOL(sock_enable_timestamp);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_getsockopt);

int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
			struct msghdr *msg, size_t size, int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}

EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}

EXPORT_SYMBOL(sock_common_setsockopt);

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release is called, processes have
	 * no access to the socket, but the network still does.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did the hash table lookup before we unhashed
	 * the socket. They will reach the receive queue and be purged by
	 * the socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);
	sock_put(sk);
}

EXPORT_SYMBOL(sk_common_release);

static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);

int proto_register(struct proto *prot, int alloc_slab)
{
	char *request_sock_slab_name = NULL;
	char *timewait_sock_slab_name;
	int rc = -ENOBUFS;

	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					       SLAB_HWCACHE_ALIGN, NULL, NULL);

		if (prot->slab == NULL) {
			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
			       prot->name);
			goto out;
		}

		if (prot->rsk_prot != NULL) {
			static const char mask[] = "request_sock_%s";

			request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
			if (request_sock_slab_name == NULL)
				goto out_free_sock_slab;

			sprintf(request_sock_slab_name, mask, prot->name);
			prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
								 prot->rsk_prot->obj_size, 0,
								 SLAB_HWCACHE_ALIGN, NULL, NULL);

			if (prot->rsk_prot->slab == NULL) {
				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
				       prot->name);
				goto out_free_request_sock_slab_name;
			}
		}

		if (prot->twsk_obj_size) {
			static const char mask[] = "tw_sock_%s";

			timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);

			if (timewait_sock_slab_name == NULL)
				goto out_free_request_sock_slab;

			sprintf(timewait_sock_slab_name, mask, prot->name);
			prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name,
							    prot->twsk_obj_size,
							    0, SLAB_HWCACHE_ALIGN,
							    NULL, NULL);
			if (prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	write_lock(&proto_list_lock);
	list_add(&prot->node, &proto_list);
	write_unlock(&proto_list_lock);
	rc = 0;
out:
	return rc;
out_free_timewait_sock_slab_name:
	kfree(timewait_sock_slab_name);
out_free_request_sock_slab:
	if (prot->rsk_prot && prot->rsk_prot->slab) {
		kmem_cache_destroy(prot->rsk_prot->slab);
		prot->rsk_prot->slab = NULL;
	}
out_free_request_sock_slab_name:
	kfree(request_sock_slab_name);
out_free_sock_slab:
	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
	goto out;
}

EXPORT_SYMBOL(proto_register);
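
/*
 * Usage sketch (hypothetical): a protocol module registers its struct
 * proto at init time and asks for a dedicated slab cache:
 *
 *	static struct proto my_prot = {
 *		.name	  = "MYPROTO",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct my_sock),
 *	};
 *
 *	err = proto_register(&my_prot, 1);
 *
 * and pairs it with proto_unregister(&my_prot) on module exit.
 * my_prot and struct my_sock are placeholders, not real symbols.
 */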

void proto_unregister(struct proto *prot)
{
	write_lock(&proto_list_lock);

	if (prot->slab != NULL) {
		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}

	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
		const char *name = kmem_cache_name(prot->rsk_prot->slab);

		kmem_cache_destroy(prot->rsk_prot->slab);
		kfree(name);
		prot->rsk_prot->slab = NULL;
	}

	if (prot->twsk_slab != NULL) {
		const char *name = kmem_cache_name(prot->twsk_slab);

		kmem_cache_destroy(prot->twsk_slab);
		kfree(name);
		prot->twsk_slab = NULL;
	}

	list_del(&prot->node);
	write_unlock(&proto_list_lock);
}

EXPORT_SYMBOL(proto_unregister);

#ifdef CONFIG_PROC_FS
static inline struct proto *__proto_head(void)
{
	return list_entry(proto_list.next, struct proto, node);
}

static inline struct proto *proto_head(void)
{
	return list_empty(&proto_list) ? NULL : __proto_head();
}

static inline struct proto *proto_next(struct proto *proto)
{
	return proto->node.next == &proto_list ? NULL :
		list_entry(proto->node.next, struct proto, node);
}

static inline struct proto *proto_get_idx(loff_t pos)
{
	struct proto *proto;
	loff_t i = 0;

	list_for_each_entry(proto, &proto_list, node)
		if (i++ == pos)
			goto out;

	proto = NULL;
out:
	return proto;
}

static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
{
	read_lock(&proto_list_lock);
	return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&proto_list_lock);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->sendpage),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, v);
	return 0;
}

static struct seq_operations proto_seq_ops = {
	.start  = proto_seq_start,
	.next   = proto_seq_next,
	.stop   = proto_seq_stop,
	.show   = proto_seq_show,
};

static int proto_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &proto_seq_ops);
}

static struct file_operations proto_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= proto_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init proto_init(void)
{
	/* register /proc/net/protocols */
	return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
#ifdef CONFIG_SYSCTL
EXPORT_SYMBOL(sysctl_optmem_max);
EXPORT_SYMBOL(sysctl_rmem_max);
EXPORT_SYMBOL(sysctl_wmem_max);
#endif