xref: /linux/net/core/sock.c (revision b233b28eac0cc37d07c2d007ea08c86c778c5af4)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116 
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <net/xfrm.h>
124 #include <linux/ipsec.h>
125 
126 #include <linux/filter.h>
127 
128 #ifdef CONFIG_INET
129 #include <net/tcp.h>
130 #endif
131 
132 /*
133  * Each address family might have different locking rules, so we have
134  * one slock key per address family:
135  */
136 static struct lock_class_key af_family_keys[AF_MAX];
137 static struct lock_class_key af_family_slock_keys[AF_MAX];
138 
139 /*
140  * Make lock validator output more readable. (we pre-construct these
141  * strings build-time, so that runtime initialization of socket
142  * locks is fast):
143  */
/*
 * Names handed to the lock validator for the per-family sk_lock classes,
 * indexed by address family number (see sock_lock_init()).  Every named
 * family uses the "sk_lock-AF_<name>" form so that these entries line up
 * with the corresponding "slock-AF_*" and "clock-AF_*" tables; slots whose
 * family has no name in this table are labelled by number.
 */
static const char *af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_MAX"
};
/*
 * Lock validator names for the per-family "slock" classes, indexed by
 * address family number.  The leading comment on each row gives the
 * index of the first entry in that row.
 */
static const char *af_family_slock_key_strings[AF_MAX+1] = {
	/*  0 */ "slock-AF_UNSPEC", "slock-AF_UNIX", "slock-AF_INET",
	/*  3 */ "slock-AF_AX25", "slock-AF_IPX", "slock-AF_APPLETALK",
	/*  6 */ "slock-AF_NETROM", "slock-AF_BRIDGE", "slock-AF_ATMPVC",
	/*  9 */ "slock-AF_X25", "slock-AF_INET6", "slock-AF_ROSE",
	/* 12 */ "slock-AF_DECnet", "slock-AF_NETBEUI", "slock-AF_SECURITY",
	/* 15 */ "slock-AF_KEY", "slock-AF_NETLINK", "slock-AF_PACKET",
	/* 18 */ "slock-AF_ASH", "slock-AF_ECONET", "slock-AF_ATMSVC",
	/* 21 */ "slock-21", "slock-AF_SNA", "slock-AF_IRDA",
	/* 24 */ "slock-AF_PPPOX", "slock-AF_WANPIPE", "slock-AF_LLC",
	/* 27 */ "slock-27", "slock-28", "slock-AF_CAN",
	/* 30 */ "slock-AF_TIPC", "slock-AF_BLUETOOTH", "slock-AF_IUCV",
	/* 33 */ "slock-AF_RXRPC", "slock-AF_ISDN", "slock-AF_PHONET",
	/* 36 */ "slock-AF_MAX"
};
/*
 * Lock validator names for the per-family sk_callback_lock classes,
 * indexed by address family number.  The leading comment on each row
 * gives the index of the first entry in that row.
 */
static const char *af_family_clock_key_strings[AF_MAX+1] = {
	/*  0 */ "clock-AF_UNSPEC", "clock-AF_UNIX", "clock-AF_INET",
	/*  3 */ "clock-AF_AX25", "clock-AF_IPX", "clock-AF_APPLETALK",
	/*  6 */ "clock-AF_NETROM", "clock-AF_BRIDGE", "clock-AF_ATMPVC",
	/*  9 */ "clock-AF_X25", "clock-AF_INET6", "clock-AF_ROSE",
	/* 12 */ "clock-AF_DECnet", "clock-AF_NETBEUI", "clock-AF_SECURITY",
	/* 15 */ "clock-AF_KEY", "clock-AF_NETLINK", "clock-AF_PACKET",
	/* 18 */ "clock-AF_ASH", "clock-AF_ECONET", "clock-AF_ATMSVC",
	/* 21 */ "clock-21", "clock-AF_SNA", "clock-AF_IRDA",
	/* 24 */ "clock-AF_PPPOX", "clock-AF_WANPIPE", "clock-AF_LLC",
	/* 27 */ "clock-27", "clock-28", "clock-AF_CAN",
	/* 30 */ "clock-AF_TIPC", "clock-AF_BLUETOOTH", "clock-AF_IUCV",
	/* 33 */ "clock-AF_RXRPC", "clock-AF_ISDN", "clock-AF_PHONET",
	/* 36 */ "clock-AF_MAX"
};
189 
190 /*
191  * sk_callback_lock locking rules are per-address-family,
192  * so split the lock classes by using a per-AF key:
193  */
194 static struct lock_class_key af_callback_keys[AF_MAX];
195 
196 /* Take into consideration the size of the struct sk_buff overhead in the
197  * determination of these values, since that is non-constant across
198  * platforms.  This makes socket queueing behavior and performance
199  * not depend upon such differences.
200  */
201 #define _SK_MEM_PACKETS		256
202 #define _SK_MEM_OVERHEAD	(sizeof(struct sk_buff) + 256)
203 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
204 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
205 
206 /* Run time adjustable parameters. */
207 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
208 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
209 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
210 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
211 
212 /* Maximal space eaten by iovec or ancilliary data plus some space */
213 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
214 
215 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
216 {
217 	struct timeval tv;
218 
219 	if (optlen < sizeof(tv))
220 		return -EINVAL;
221 	if (copy_from_user(&tv, optval, sizeof(tv)))
222 		return -EFAULT;
223 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
224 		return -EDOM;
225 
226 	if (tv.tv_sec < 0) {
227 		static int warned __read_mostly;
228 
229 		*timeo_p = 0;
230 		if (warned < 10 && net_ratelimit()) {
231 			warned++;
232 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
233 			       "tries to set negative timeout\n",
234 				current->comm, task_pid_nr(current));
235 		}
236 		return 0;
237 	}
238 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
239 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
240 		return 0;
241 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
242 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
243 	return 0;
244 }
245 
246 static void sock_warn_obsolete_bsdism(const char *name)
247 {
248 	static int warned;
249 	static char warncomm[TASK_COMM_LEN];
250 	if (strcmp(warncomm, current->comm) && warned < 5) {
251 		strcpy(warncomm,  current->comm);
252 		printk(KERN_WARNING "process `%s' is using obsolete "
253 		       "%s SO_BSDCOMPAT\n", warncomm, name);
254 		warned++;
255 	}
256 }
257 
258 static void sock_disable_timestamp(struct sock *sk)
259 {
260 	if (sock_flag(sk, SOCK_TIMESTAMP)) {
261 		sock_reset_flag(sk, SOCK_TIMESTAMP);
262 		net_disable_timestamp();
263 	}
264 }
265 
266 
267 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
268 {
269 	int err = 0;
270 	int skb_len;
271 
272 	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
273 	   number of warnings when compiling with -W --ANK
274 	 */
275 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
276 	    (unsigned)sk->sk_rcvbuf) {
277 		err = -ENOMEM;
278 		goto out;
279 	}
280 
281 	err = sk_filter(sk, skb);
282 	if (err)
283 		goto out;
284 
285 	if (!sk_rmem_schedule(sk, skb->truesize)) {
286 		err = -ENOBUFS;
287 		goto out;
288 	}
289 
290 	skb->dev = NULL;
291 	skb_set_owner_r(skb, sk);
292 
293 	/* Cache the SKB length before we tack it onto the receive
294 	 * queue.  Once it is added it no longer belongs to us and
295 	 * may be freed by other threads of control pulling packets
296 	 * from the queue.
297 	 */
298 	skb_len = skb->len;
299 
300 	skb_queue_tail(&sk->sk_receive_queue, skb);
301 
302 	if (!sock_flag(sk, SOCK_DEAD))
303 		sk->sk_data_ready(sk, skb_len);
304 out:
305 	return err;
306 }
307 EXPORT_SYMBOL(sock_queue_rcv_skb);
308 
309 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
310 {
311 	int rc = NET_RX_SUCCESS;
312 
313 	if (sk_filter(sk, skb))
314 		goto discard_and_relse;
315 
316 	skb->dev = NULL;
317 
318 	if (nested)
319 		bh_lock_sock_nested(sk);
320 	else
321 		bh_lock_sock(sk);
322 	if (!sock_owned_by_user(sk)) {
323 		/*
324 		 * trylock + unlock semantics:
325 		 */
326 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
327 
328 		rc = sk_backlog_rcv(sk, skb);
329 
330 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
331 	} else
332 		sk_add_backlog(sk, skb);
333 	bh_unlock_sock(sk);
334 out:
335 	sock_put(sk);
336 	return rc;
337 discard_and_relse:
338 	kfree_skb(skb);
339 	goto out;
340 }
341 EXPORT_SYMBOL(sk_receive_skb);
342 
343 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
344 {
345 	struct dst_entry *dst = sk->sk_dst_cache;
346 
347 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
348 		sk->sk_dst_cache = NULL;
349 		dst_release(dst);
350 		return NULL;
351 	}
352 
353 	return dst;
354 }
355 EXPORT_SYMBOL(__sk_dst_check);
356 
357 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
358 {
359 	struct dst_entry *dst = sk_dst_get(sk);
360 
361 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
362 		sk_dst_reset(sk);
363 		dst_release(dst);
364 		return NULL;
365 	}
366 
367 	return dst;
368 }
369 EXPORT_SYMBOL(sk_dst_check);
370 
371 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
372 {
373 	int ret = -ENOPROTOOPT;
374 #ifdef CONFIG_NETDEVICES
375 	struct net *net = sock_net(sk);
376 	char devname[IFNAMSIZ];
377 	int index;
378 
379 	/* Sorry... */
380 	ret = -EPERM;
381 	if (!capable(CAP_NET_RAW))
382 		goto out;
383 
384 	ret = -EINVAL;
385 	if (optlen < 0)
386 		goto out;
387 
388 	/* Bind this socket to a particular device like "eth0",
389 	 * as specified in the passed interface name. If the
390 	 * name is "" or the option length is zero the socket
391 	 * is not bound.
392 	 */
393 	if (optlen > IFNAMSIZ - 1)
394 		optlen = IFNAMSIZ - 1;
395 	memset(devname, 0, sizeof(devname));
396 
397 	ret = -EFAULT;
398 	if (copy_from_user(devname, optval, optlen))
399 		goto out;
400 
401 	if (devname[0] == '\0') {
402 		index = 0;
403 	} else {
404 		struct net_device *dev = dev_get_by_name(net, devname);
405 
406 		ret = -ENODEV;
407 		if (!dev)
408 			goto out;
409 
410 		index = dev->ifindex;
411 		dev_put(dev);
412 	}
413 
414 	lock_sock(sk);
415 	sk->sk_bound_dev_if = index;
416 	sk_dst_reset(sk);
417 	release_sock(sk);
418 
419 	ret = 0;
420 
421 out:
422 #endif
423 
424 	return ret;
425 }
426 
/* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}
	sock_set_flag(sk, bit);
}
434 
435 /*
436  *	This is meant for all protocols to use and covers goings on
437  *	at the socket level. Everything here is generic.
438  */
439 
440 int sock_setsockopt(struct socket *sock, int level, int optname,
441 		    char __user *optval, int optlen)
442 {
443 	struct sock *sk=sock->sk;
444 	int val;
445 	int valbool;
446 	struct linger ling;
447 	int ret = 0;
448 
449 	/*
450 	 *	Options without arguments
451 	 */
452 
453 	if (optname == SO_BINDTODEVICE)
454 		return sock_bindtodevice(sk, optval, optlen);
455 
456 	if (optlen < sizeof(int))
457 		return -EINVAL;
458 
459 	if (get_user(val, (int __user *)optval))
460 		return -EFAULT;
461 
462 	valbool = val?1:0;
463 
464 	lock_sock(sk);
465 
466 	switch(optname) {
467 	case SO_DEBUG:
468 		if (val && !capable(CAP_NET_ADMIN)) {
469 			ret = -EACCES;
470 		} else
471 			sock_valbool_flag(sk, SOCK_DBG, valbool);
472 		break;
473 	case SO_REUSEADDR:
474 		sk->sk_reuse = valbool;
475 		break;
476 	case SO_TYPE:
477 	case SO_ERROR:
478 		ret = -ENOPROTOOPT;
479 		break;
480 	case SO_DONTROUTE:
481 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
482 		break;
483 	case SO_BROADCAST:
484 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
485 		break;
486 	case SO_SNDBUF:
487 		/* Don't error on this BSD doesn't and if you think
488 		   about it this is right. Otherwise apps have to
489 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
490 		   are treated in BSD as hints */
491 
492 		if (val > sysctl_wmem_max)
493 			val = sysctl_wmem_max;
494 set_sndbuf:
495 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
496 		if ((val * 2) < SOCK_MIN_SNDBUF)
497 			sk->sk_sndbuf = SOCK_MIN_SNDBUF;
498 		else
499 			sk->sk_sndbuf = val * 2;
500 
501 		/*
502 		 *	Wake up sending tasks if we
503 		 *	upped the value.
504 		 */
505 		sk->sk_write_space(sk);
506 		break;
507 
508 	case SO_SNDBUFFORCE:
509 		if (!capable(CAP_NET_ADMIN)) {
510 			ret = -EPERM;
511 			break;
512 		}
513 		goto set_sndbuf;
514 
515 	case SO_RCVBUF:
516 		/* Don't error on this BSD doesn't and if you think
517 		   about it this is right. Otherwise apps have to
518 		   play 'guess the biggest size' games. RCVBUF/SNDBUF
519 		   are treated in BSD as hints */
520 
521 		if (val > sysctl_rmem_max)
522 			val = sysctl_rmem_max;
523 set_rcvbuf:
524 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
525 		/*
526 		 * We double it on the way in to account for
527 		 * "struct sk_buff" etc. overhead.   Applications
528 		 * assume that the SO_RCVBUF setting they make will
529 		 * allow that much actual data to be received on that
530 		 * socket.
531 		 *
532 		 * Applications are unaware that "struct sk_buff" and
533 		 * other overheads allocate from the receive buffer
534 		 * during socket buffer allocation.
535 		 *
536 		 * And after considering the possible alternatives,
537 		 * returning the value we actually used in getsockopt
538 		 * is the most desirable behavior.
539 		 */
540 		if ((val * 2) < SOCK_MIN_RCVBUF)
541 			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
542 		else
543 			sk->sk_rcvbuf = val * 2;
544 		break;
545 
546 	case SO_RCVBUFFORCE:
547 		if (!capable(CAP_NET_ADMIN)) {
548 			ret = -EPERM;
549 			break;
550 		}
551 		goto set_rcvbuf;
552 
553 	case SO_KEEPALIVE:
554 #ifdef CONFIG_INET
555 		if (sk->sk_protocol == IPPROTO_TCP)
556 			tcp_set_keepalive(sk, valbool);
557 #endif
558 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
559 		break;
560 
561 	case SO_OOBINLINE:
562 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
563 		break;
564 
565 	case SO_NO_CHECK:
566 		sk->sk_no_check = valbool;
567 		break;
568 
569 	case SO_PRIORITY:
570 		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
571 			sk->sk_priority = val;
572 		else
573 			ret = -EPERM;
574 		break;
575 
576 	case SO_LINGER:
577 		if (optlen < sizeof(ling)) {
578 			ret = -EINVAL;	/* 1003.1g */
579 			break;
580 		}
581 		if (copy_from_user(&ling,optval,sizeof(ling))) {
582 			ret = -EFAULT;
583 			break;
584 		}
585 		if (!ling.l_onoff)
586 			sock_reset_flag(sk, SOCK_LINGER);
587 		else {
588 #if (BITS_PER_LONG == 32)
589 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
590 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
591 			else
592 #endif
593 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
594 			sock_set_flag(sk, SOCK_LINGER);
595 		}
596 		break;
597 
598 	case SO_BSDCOMPAT:
599 		sock_warn_obsolete_bsdism("setsockopt");
600 		break;
601 
602 	case SO_PASSCRED:
603 		if (valbool)
604 			set_bit(SOCK_PASSCRED, &sock->flags);
605 		else
606 			clear_bit(SOCK_PASSCRED, &sock->flags);
607 		break;
608 
609 	case SO_TIMESTAMP:
610 	case SO_TIMESTAMPNS:
611 		if (valbool)  {
612 			if (optname == SO_TIMESTAMP)
613 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
614 			else
615 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
616 			sock_set_flag(sk, SOCK_RCVTSTAMP);
617 			sock_enable_timestamp(sk);
618 		} else {
619 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
620 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
621 		}
622 		break;
623 
624 	case SO_RCVLOWAT:
625 		if (val < 0)
626 			val = INT_MAX;
627 		sk->sk_rcvlowat = val ? : 1;
628 		break;
629 
630 	case SO_RCVTIMEO:
631 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
632 		break;
633 
634 	case SO_SNDTIMEO:
635 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
636 		break;
637 
638 	case SO_ATTACH_FILTER:
639 		ret = -EINVAL;
640 		if (optlen == sizeof(struct sock_fprog)) {
641 			struct sock_fprog fprog;
642 
643 			ret = -EFAULT;
644 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
645 				break;
646 
647 			ret = sk_attach_filter(&fprog, sk);
648 		}
649 		break;
650 
651 	case SO_DETACH_FILTER:
652 		ret = sk_detach_filter(sk);
653 		break;
654 
655 	case SO_PASSSEC:
656 		if (valbool)
657 			set_bit(SOCK_PASSSEC, &sock->flags);
658 		else
659 			clear_bit(SOCK_PASSSEC, &sock->flags);
660 		break;
661 	case SO_MARK:
662 		if (!capable(CAP_NET_ADMIN))
663 			ret = -EPERM;
664 		else {
665 			sk->sk_mark = val;
666 		}
667 		break;
668 
669 		/* We implement the SO_SNDLOWAT etc to
670 		   not be settable (1003.1g 5.3) */
671 	default:
672 		ret = -ENOPROTOOPT;
673 		break;
674 	}
675 	release_sock(sk);
676 	return ret;
677 }
678 
679 
680 int sock_getsockopt(struct socket *sock, int level, int optname,
681 		    char __user *optval, int __user *optlen)
682 {
683 	struct sock *sk = sock->sk;
684 
685 	union {
686 		int val;
687 		struct linger ling;
688 		struct timeval tm;
689 	} v;
690 
691 	unsigned int lv = sizeof(int);
692 	int len;
693 
694 	if (get_user(len, optlen))
695 		return -EFAULT;
696 	if (len < 0)
697 		return -EINVAL;
698 
699 	switch(optname) {
700 	case SO_DEBUG:
701 		v.val = sock_flag(sk, SOCK_DBG);
702 		break;
703 
704 	case SO_DONTROUTE:
705 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
706 		break;
707 
708 	case SO_BROADCAST:
709 		v.val = !!sock_flag(sk, SOCK_BROADCAST);
710 		break;
711 
712 	case SO_SNDBUF:
713 		v.val = sk->sk_sndbuf;
714 		break;
715 
716 	case SO_RCVBUF:
717 		v.val = sk->sk_rcvbuf;
718 		break;
719 
720 	case SO_REUSEADDR:
721 		v.val = sk->sk_reuse;
722 		break;
723 
724 	case SO_KEEPALIVE:
725 		v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
726 		break;
727 
728 	case SO_TYPE:
729 		v.val = sk->sk_type;
730 		break;
731 
732 	case SO_ERROR:
733 		v.val = -sock_error(sk);
734 		if (v.val==0)
735 			v.val = xchg(&sk->sk_err_soft, 0);
736 		break;
737 
738 	case SO_OOBINLINE:
739 		v.val = !!sock_flag(sk, SOCK_URGINLINE);
740 		break;
741 
742 	case SO_NO_CHECK:
743 		v.val = sk->sk_no_check;
744 		break;
745 
746 	case SO_PRIORITY:
747 		v.val = sk->sk_priority;
748 		break;
749 
750 	case SO_LINGER:
751 		lv		= sizeof(v.ling);
752 		v.ling.l_onoff	= !!sock_flag(sk, SOCK_LINGER);
753 		v.ling.l_linger	= sk->sk_lingertime / HZ;
754 		break;
755 
756 	case SO_BSDCOMPAT:
757 		sock_warn_obsolete_bsdism("getsockopt");
758 		break;
759 
760 	case SO_TIMESTAMP:
761 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
762 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
763 		break;
764 
765 	case SO_TIMESTAMPNS:
766 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
767 		break;
768 
769 	case SO_RCVTIMEO:
770 		lv=sizeof(struct timeval);
771 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
772 			v.tm.tv_sec = 0;
773 			v.tm.tv_usec = 0;
774 		} else {
775 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
776 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
777 		}
778 		break;
779 
780 	case SO_SNDTIMEO:
781 		lv=sizeof(struct timeval);
782 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
783 			v.tm.tv_sec = 0;
784 			v.tm.tv_usec = 0;
785 		} else {
786 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
787 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
788 		}
789 		break;
790 
791 	case SO_RCVLOWAT:
792 		v.val = sk->sk_rcvlowat;
793 		break;
794 
795 	case SO_SNDLOWAT:
796 		v.val=1;
797 		break;
798 
799 	case SO_PASSCRED:
800 		v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
801 		break;
802 
803 	case SO_PEERCRED:
804 		if (len > sizeof(sk->sk_peercred))
805 			len = sizeof(sk->sk_peercred);
806 		if (copy_to_user(optval, &sk->sk_peercred, len))
807 			return -EFAULT;
808 		goto lenout;
809 
810 	case SO_PEERNAME:
811 	{
812 		char address[128];
813 
814 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
815 			return -ENOTCONN;
816 		if (lv < len)
817 			return -EINVAL;
818 		if (copy_to_user(optval, address, len))
819 			return -EFAULT;
820 		goto lenout;
821 	}
822 
823 	/* Dubious BSD thing... Probably nobody even uses it, but
824 	 * the UNIX standard wants it for whatever reason... -DaveM
825 	 */
826 	case SO_ACCEPTCONN:
827 		v.val = sk->sk_state == TCP_LISTEN;
828 		break;
829 
830 	case SO_PASSSEC:
831 		v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
832 		break;
833 
834 	case SO_PEERSEC:
835 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
836 
837 	case SO_MARK:
838 		v.val = sk->sk_mark;
839 		break;
840 
841 	default:
842 		return -ENOPROTOOPT;
843 	}
844 
845 	if (len > lv)
846 		len = lv;
847 	if (copy_to_user(optval, &v, len))
848 		return -EFAULT;
849 lenout:
850 	if (put_user(len, optlen))
851 		return -EFAULT;
852 	return 0;
853 }
854 
855 /*
856  * Initialize an sk_lock.
857  *
858  * (We also register the sk_lock with the lock validator.)
859  */
860 static inline void sock_lock_init(struct sock *sk)
861 {
862 	sock_lock_init_class_and_name(sk,
863 			af_family_slock_key_strings[sk->sk_family],
864 			af_family_slock_keys + sk->sk_family,
865 			af_family_key_strings[sk->sk_family],
866 			af_family_keys + sk->sk_family);
867 }
868 
869 static void sock_copy(struct sock *nsk, const struct sock *osk)
870 {
871 #ifdef CONFIG_SECURITY_NETWORK
872 	void *sptr = nsk->sk_security;
873 #endif
874 
875 	memcpy(nsk, osk, osk->sk_prot->obj_size);
876 #ifdef CONFIG_SECURITY_NETWORK
877 	nsk->sk_security = sptr;
878 	security_sk_clone(osk, nsk);
879 #endif
880 }
881 
882 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
883 		int family)
884 {
885 	struct sock *sk;
886 	struct kmem_cache *slab;
887 
888 	slab = prot->slab;
889 	if (slab != NULL)
890 		sk = kmem_cache_alloc(slab, priority);
891 	else
892 		sk = kmalloc(prot->obj_size, priority);
893 
894 	if (sk != NULL) {
895 		if (security_sk_alloc(sk, family, priority))
896 			goto out_free;
897 
898 		if (!try_module_get(prot->owner))
899 			goto out_free_sec;
900 	}
901 
902 	return sk;
903 
904 out_free_sec:
905 	security_sk_free(sk);
906 out_free:
907 	if (slab != NULL)
908 		kmem_cache_free(slab, sk);
909 	else
910 		kfree(sk);
911 	return NULL;
912 }
913 
914 static void sk_prot_free(struct proto *prot, struct sock *sk)
915 {
916 	struct kmem_cache *slab;
917 	struct module *owner;
918 
919 	owner = prot->owner;
920 	slab = prot->slab;
921 
922 	security_sk_free(sk);
923 	if (slab != NULL)
924 		kmem_cache_free(slab, sk);
925 	else
926 		kfree(sk);
927 	module_put(owner);
928 }
929 
930 /**
931  *	sk_alloc - All socket objects are allocated here
932  *	@net: the applicable net namespace
933  *	@family: protocol family
934  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
935  *	@prot: struct proto associated with this new sock instance
936  */
937 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
938 		      struct proto *prot)
939 {
940 	struct sock *sk;
941 
942 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
943 	if (sk) {
944 		sk->sk_family = family;
945 		/*
946 		 * See comment in struct sock definition to understand
947 		 * why we need sk_prot_creator -acme
948 		 */
949 		sk->sk_prot = sk->sk_prot_creator = prot;
950 		sock_lock_init(sk);
951 		sock_net_set(sk, get_net(net));
952 	}
953 
954 	return sk;
955 }
956 
957 void sk_free(struct sock *sk)
958 {
959 	struct sk_filter *filter;
960 
961 	if (sk->sk_destruct)
962 		sk->sk_destruct(sk);
963 
964 	filter = rcu_dereference(sk->sk_filter);
965 	if (filter) {
966 		sk_filter_uncharge(sk, filter);
967 		rcu_assign_pointer(sk->sk_filter, NULL);
968 	}
969 
970 	sock_disable_timestamp(sk);
971 
972 	if (atomic_read(&sk->sk_omem_alloc))
973 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
974 		       __func__, atomic_read(&sk->sk_omem_alloc));
975 
976 	put_net(sock_net(sk));
977 	sk_prot_free(sk->sk_prot_creator, sk);
978 }
979 
980 /*
981  * Last sock_put should drop referrence to sk->sk_net. It has already
982  * been dropped in sk_change_net. Taking referrence to stopping namespace
983  * is not an option.
984  * Take referrence to a socket to remove it from hash _alive_ and after that
985  * destroy it in the context of init_net.
986  */
987 void sk_release_kernel(struct sock *sk)
988 {
989 	if (sk == NULL || sk->sk_socket == NULL)
990 		return;
991 
992 	sock_hold(sk);
993 	sock_release(sk->sk_socket);
994 	release_net(sock_net(sk));
995 	sock_net_set(sk, get_net(&init_net));
996 	sock_put(sk);
997 }
998 EXPORT_SYMBOL(sk_release_kernel);
999 
1000 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1001 {
1002 	struct sock *newsk;
1003 
1004 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1005 	if (newsk != NULL) {
1006 		struct sk_filter *filter;
1007 
1008 		sock_copy(newsk, sk);
1009 
1010 		/* SANITY */
1011 		get_net(sock_net(newsk));
1012 		sk_node_init(&newsk->sk_node);
1013 		sock_lock_init(newsk);
1014 		bh_lock_sock(newsk);
1015 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1016 
1017 		atomic_set(&newsk->sk_rmem_alloc, 0);
1018 		atomic_set(&newsk->sk_wmem_alloc, 0);
1019 		atomic_set(&newsk->sk_omem_alloc, 0);
1020 		skb_queue_head_init(&newsk->sk_receive_queue);
1021 		skb_queue_head_init(&newsk->sk_write_queue);
1022 #ifdef CONFIG_NET_DMA
1023 		skb_queue_head_init(&newsk->sk_async_wait_queue);
1024 #endif
1025 
1026 		rwlock_init(&newsk->sk_dst_lock);
1027 		rwlock_init(&newsk->sk_callback_lock);
1028 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1029 				af_callback_keys + newsk->sk_family,
1030 				af_family_clock_key_strings[newsk->sk_family]);
1031 
1032 		newsk->sk_dst_cache	= NULL;
1033 		newsk->sk_wmem_queued	= 0;
1034 		newsk->sk_forward_alloc = 0;
1035 		newsk->sk_send_head	= NULL;
1036 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1037 
1038 		sock_reset_flag(newsk, SOCK_DONE);
1039 		skb_queue_head_init(&newsk->sk_error_queue);
1040 
1041 		filter = newsk->sk_filter;
1042 		if (filter != NULL)
1043 			sk_filter_charge(newsk, filter);
1044 
1045 		if (unlikely(xfrm_sk_clone_policy(newsk))) {
1046 			/* It is still raw copy of parent, so invalidate
1047 			 * destructor and make plain sk_free() */
1048 			newsk->sk_destruct = NULL;
1049 			sk_free(newsk);
1050 			newsk = NULL;
1051 			goto out;
1052 		}
1053 
1054 		newsk->sk_err	   = 0;
1055 		newsk->sk_priority = 0;
1056 		atomic_set(&newsk->sk_refcnt, 2);
1057 
1058 		/*
1059 		 * Increment the counter in the same struct proto as the master
1060 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1061 		 * is the same as sk->sk_prot->socks, as this field was copied
1062 		 * with memcpy).
1063 		 *
1064 		 * This _changes_ the previous behaviour, where
1065 		 * tcp_create_openreq_child always was incrementing the
1066 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1067 		 * to be taken into account in all callers. -acme
1068 		 */
1069 		sk_refcnt_debug_inc(newsk);
1070 		sk_set_socket(newsk, NULL);
1071 		newsk->sk_sleep	 = NULL;
1072 
1073 		if (newsk->sk_prot->sockets_allocated)
1074 			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1075 	}
1076 out:
1077 	return newsk;
1078 }
1079 
1080 EXPORT_SYMBOL_GPL(sk_clone);
1081 
1082 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1083 {
1084 	__sk_dst_set(sk, dst);
1085 	sk->sk_route_caps = dst->dev->features;
1086 	if (sk->sk_route_caps & NETIF_F_GSO)
1087 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1088 	if (sk_can_gso(sk)) {
1089 		if (dst->header_len) {
1090 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1091 		} else {
1092 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1093 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1094 		}
1095 	}
1096 }
1097 EXPORT_SYMBOL_GPL(sk_setup_caps);
1098 
1099 void __init sk_init(void)
1100 {
1101 	if (num_physpages <= 4096) {
1102 		sysctl_wmem_max = 32767;
1103 		sysctl_rmem_max = 32767;
1104 		sysctl_wmem_default = 32767;
1105 		sysctl_rmem_default = 32767;
1106 	} else if (num_physpages >= 131072) {
1107 		sysctl_wmem_max = 131071;
1108 		sysctl_rmem_max = 131071;
1109 	}
1110 }
1111 
1112 /*
1113  *	Simple resource managers for sockets.
1114  */
1115 
1116 
1117 /*
1118  * Write buffer destructor automatically called from kfree_skb.
1119  */
1120 void sock_wfree(struct sk_buff *skb)
1121 {
1122 	struct sock *sk = skb->sk;
1123 
1124 	/* In case it might be waiting for more memory. */
1125 	atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
1126 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
1127 		sk->sk_write_space(sk);
1128 	sock_put(sk);
1129 }
1130 
1131 /*
1132  * Read buffer destructor automatically called from kfree_skb.
1133  */
1134 void sock_rfree(struct sk_buff *skb)
1135 {
1136 	struct sock *sk = skb->sk;
1137 
1138 	skb_truesize_check(skb);
1139 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1140 	sk_mem_uncharge(skb->sk, skb->truesize);
1141 }
1142 
1143 
1144 int sock_i_uid(struct sock *sk)
1145 {
1146 	int uid;
1147 
1148 	read_lock(&sk->sk_callback_lock);
1149 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1150 	read_unlock(&sk->sk_callback_lock);
1151 	return uid;
1152 }
1153 
1154 unsigned long sock_i_ino(struct sock *sk)
1155 {
1156 	unsigned long ino;
1157 
1158 	read_lock(&sk->sk_callback_lock);
1159 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1160 	read_unlock(&sk->sk_callback_lock);
1161 	return ino;
1162 }
1163 
1164 /*
1165  * Allocate a skb from the socket's send buffer.
1166  */
1167 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1168 			     gfp_t priority)
1169 {
1170 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1171 		struct sk_buff * skb = alloc_skb(size, priority);
1172 		if (skb) {
1173 			skb_set_owner_w(skb, sk);
1174 			return skb;
1175 		}
1176 	}
1177 	return NULL;
1178 }
1179 
1180 /*
1181  * Allocate a skb from the socket's receive buffer.
1182  */
1183 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1184 			     gfp_t priority)
1185 {
1186 	if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1187 		struct sk_buff *skb = alloc_skb(size, priority);
1188 		if (skb) {
1189 			skb_set_owner_r(skb, sk);
1190 			return skb;
1191 		}
1192 	}
1193 	return NULL;
1194 }
1195 
1196 /*
1197  * Allocate a memory block from the socket's option memory buffer.
1198  */
1199 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1200 {
1201 	if ((unsigned)size <= sysctl_optmem_max &&
1202 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1203 		void *mem;
1204 		/* First do the add, to avoid the race if kmalloc
1205 		 * might sleep.
1206 		 */
1207 		atomic_add(size, &sk->sk_omem_alloc);
1208 		mem = kmalloc(size, priority);
1209 		if (mem)
1210 			return mem;
1211 		atomic_sub(size, &sk->sk_omem_alloc);
1212 	}
1213 	return NULL;
1214 }
1215 
1216 /*
1217  * Free an option memory block.
1218  */
1219 void sock_kfree_s(struct sock *sk, void *mem, int size)
1220 {
1221 	kfree(mem);
1222 	atomic_sub(size, &sk->sk_omem_alloc);
1223 }
1224 
1225 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1226    I think, these locks should be removed for datagram sockets.
1227  */
1228 static long sock_wait_for_wmem(struct sock * sk, long timeo)
1229 {
1230 	DEFINE_WAIT(wait);
1231 
1232 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1233 	for (;;) {
1234 		if (!timeo)
1235 			break;
1236 		if (signal_pending(current))
1237 			break;
1238 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1239 		prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1240 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1241 			break;
1242 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1243 			break;
1244 		if (sk->sk_err)
1245 			break;
1246 		timeo = schedule_timeout(timeo);
1247 	}
1248 	finish_wait(sk->sk_sleep, &wait);
1249 	return timeo;
1250 }
1251 
1252 
1253 /*
1254  *	Generic send/receive buffer handlers
1255  */
1256 
1257 static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
1258 					    unsigned long header_len,
1259 					    unsigned long data_len,
1260 					    int noblock, int *errcode)
1261 {
1262 	struct sk_buff *skb;
1263 	gfp_t gfp_mask;
1264 	long timeo;
1265 	int err;
1266 
1267 	gfp_mask = sk->sk_allocation;
1268 	if (gfp_mask & __GFP_WAIT)
1269 		gfp_mask |= __GFP_REPEAT;
1270 
1271 	timeo = sock_sndtimeo(sk, noblock);
1272 	while (1) {
1273 		err = sock_error(sk);
1274 		if (err != 0)
1275 			goto failure;
1276 
1277 		err = -EPIPE;
1278 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1279 			goto failure;
1280 
1281 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1282 			skb = alloc_skb(header_len, gfp_mask);
1283 			if (skb) {
1284 				int npages;
1285 				int i;
1286 
1287 				/* No pages, we're done... */
1288 				if (!data_len)
1289 					break;
1290 
1291 				npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1292 				skb->truesize += data_len;
1293 				skb_shinfo(skb)->nr_frags = npages;
1294 				for (i = 0; i < npages; i++) {
1295 					struct page *page;
1296 					skb_frag_t *frag;
1297 
1298 					page = alloc_pages(sk->sk_allocation, 0);
1299 					if (!page) {
1300 						err = -ENOBUFS;
1301 						skb_shinfo(skb)->nr_frags = i;
1302 						kfree_skb(skb);
1303 						goto failure;
1304 					}
1305 
1306 					frag = &skb_shinfo(skb)->frags[i];
1307 					frag->page = page;
1308 					frag->page_offset = 0;
1309 					frag->size = (data_len >= PAGE_SIZE ?
1310 						      PAGE_SIZE :
1311 						      data_len);
1312 					data_len -= PAGE_SIZE;
1313 				}
1314 
1315 				/* Full success... */
1316 				break;
1317 			}
1318 			err = -ENOBUFS;
1319 			goto failure;
1320 		}
1321 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1322 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1323 		err = -EAGAIN;
1324 		if (!timeo)
1325 			goto failure;
1326 		if (signal_pending(current))
1327 			goto interrupted;
1328 		timeo = sock_wait_for_wmem(sk, timeo);
1329 	}
1330 
1331 	skb_set_owner_w(skb, sk);
1332 	return skb;
1333 
1334 interrupted:
1335 	err = sock_intr_errno(timeo);
1336 failure:
1337 	*errcode = err;
1338 	return NULL;
1339 }
1340 
/*
 * sock_alloc_send_skb - allocate a purely linear send skb
 *
 * Convenience wrapper around sock_alloc_send_pskb() that requests @size
 * bytes of linear space and no paged data.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long no_paged_data = 0;

	return sock_alloc_send_pskb(sk, size, no_paged_data, noblock, errcode);
}
1346 
1347 static void __lock_sock(struct sock *sk)
1348 {
1349 	DEFINE_WAIT(wait);
1350 
1351 	for (;;) {
1352 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1353 					TASK_UNINTERRUPTIBLE);
1354 		spin_unlock_bh(&sk->sk_lock.slock);
1355 		schedule();
1356 		spin_lock_bh(&sk->sk_lock.slock);
1357 		if (!sock_owned_by_user(sk))
1358 			break;
1359 	}
1360 	finish_wait(&sk->sk_lock.wq, &wait);
1361 }
1362 
/*
 * Drain the socket backlog.
 *
 * Each outer pass detaches the whole backlog list (head and tail are reset
 * to NULL), releases the socket with bh_unlock_sock(), feeds every detached
 * skb to sk_backlog_rcv() and then retakes the lock with bh_lock_sock().
 * The pass repeats for as long as a new backlog head has appeared meanwhile.
 *
 * The first skb is dereferenced unconditionally, so the backlog must be
 * non-empty on entry; release_sock() tests sk_backlog.tail before calling.
 */
1363 static void __release_sock(struct sock *sk)
1364 {
1365 	struct sk_buff *skb = sk->sk_backlog.head;
1366 
1367 	do {
		/* Take the queued list private before dropping the lock. */
1368 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1369 		bh_unlock_sock(sk);
1370 
1371 		do {
1372 			struct sk_buff *next = skb->next;
1373 
			/* Unlink the skb before handing it over. */
1374 			skb->next = NULL;
1375 			sk_backlog_rcv(sk, skb);
1376 
1377 			/*
1378 			 * We are in process context here with softirqs
1379 			 * disabled, use cond_resched_softirq() to preempt.
1380 			 * This is safe to do because we've taken the backlog
1381 			 * queue private:
1382 			 */
1383 			cond_resched_softirq();
1384 
1385 			skb = next;
1386 		} while (skb != NULL);
1387 
1388 		bh_lock_sock(sk);
		/* Anything queued while unlocked starts another pass. */
1389 	} while ((skb = sk->sk_backlog.head) != NULL);
1390 }
1391 
1392 /**
1393  * sk_wait_data - wait for data to arrive at sk_receive_queue
1394  * @sk:    sock to wait on
1395  * @timeo: for how long
1396  *
1397  * Now socket state including sk->sk_err is changed only under lock,
1398  * hence we may omit checks after joining wait queue.
1399  * We check receive queue before schedule() only as optimization;
1400  * it is very likely that release_sock() added new data.
1401  */
1402 int sk_wait_data(struct sock *sk, long *timeo)
1403 {
1404 	int rc;
1405 	DEFINE_WAIT(wait);
1406 
1407 	prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1408 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1409 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1410 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1411 	finish_wait(sk->sk_sleep, &wait);
1412 	return rc;
1413 }
1414 
1415 EXPORT_SYMBOL(sk_wait_data);
1416 
1417 /**
1418  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1419  *	@sk: socket
1420  *	@size: memory size to allocate
1421  *	@kind: allocation type
1422  *
1423  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1424  *	rmem allocation. This function assumes that protocols which have
1425  *	memory_pressure use sk_wmem_queued as write buffer accounting.
 *
 *	The charge is applied first and only rolled back on refusal.
 *	Returns 1 when the charge stands, 0 when it has been undone.
1426  */
1427 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1428 {
1429 	struct proto *prot = sk->sk_prot;
1430 	int amt = sk_mem_pages(size);
1431 	int allocated;
1432 
	/* Charge optimistically: socket-local first, then protocol-wide. */
1433 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1434 	allocated = atomic_add_return(amt, prot->memory_allocated);
1435 
1436 	/* Under limit. */
1437 	if (allocated <= prot->sysctl_mem[0]) {
		/* Back under sysctl_mem[0]: clear the pressure flag if set. */
1438 		if (prot->memory_pressure && *prot->memory_pressure)
1439 			*prot->memory_pressure = 0;
1440 		return 1;
1441 	}
1442 
1443 	/* Under pressure. */
1444 	if (allocated > prot->sysctl_mem[1])
1445 		if (prot->enter_memory_pressure)
1446 			prot->enter_memory_pressure(sk);
1447 
1448 	/* Over hard limit. */
1449 	if (allocated > prot->sysctl_mem[2])
1450 		goto suppress_allocation;
1451 
1452 	/* guarantee minimum buffer size under pressure */
1453 	if (kind == SK_MEM_RECV) {
1454 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1455 			return 1;
1456 	} else { /* SK_MEM_SEND */
1457 		if (sk->sk_type == SOCK_STREAM) {
1458 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1459 				return 1;
1460 		} else if (atomic_read(&sk->sk_wmem_alloc) <
1461 			   prot->sysctl_wmem[0])
1462 				return 1;
1463 	}
1464 
1465 	if (prot->memory_pressure) {
1466 		int alloc;
1467 
		/* Pressure flag not raised: let the charge stand. */
1468 		if (!*prot->memory_pressure)
1469 			return 1;
		/*
		 * Accept while sysctl_mem[2] still exceeds this socket's page
		 * usage multiplied by the number of allocated sockets.
		 */
1470 		alloc = percpu_counter_read_positive(prot->sockets_allocated);
1471 		if (prot->sysctl_mem[2] > alloc *
1472 		    sk_mem_pages(sk->sk_wmem_queued +
1473 				 atomic_read(&sk->sk_rmem_alloc) +
1474 				 sk->sk_forward_alloc))
1475 			return 1;
1476 	}
1477 
1478 suppress_allocation:
1479 
1480 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1481 		sk_stream_moderate_sndbuf(sk);
1482 
1483 		/* Fail only if socket is _under_ its sndbuf.
1484 		 * In this case we cannot block, so that we have to fail.
1485 		 */
1486 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1487 			return 1;
1488 	}
1489 
1490 	/* Alas. Undo changes. */
1491 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1492 	atomic_sub(amt, prot->memory_allocated);
1493 	return 0;
1494 }
1495 
1496 EXPORT_SYMBOL(__sk_mem_schedule);
1497 
1498 /**
1499  *	__sk_reclaim - reclaim memory_allocated
1500  *	@sk: socket
1501  */
1502 void __sk_mem_reclaim(struct sock *sk)
1503 {
1504 	struct proto *prot = sk->sk_prot;
1505 
1506 	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1507 		   prot->memory_allocated);
1508 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1509 
1510 	if (prot->memory_pressure && *prot->memory_pressure &&
1511 	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1512 		*prot->memory_pressure = 0;
1513 }
1514 
1515 EXPORT_SYMBOL(__sk_mem_reclaim);
1516 
1517 
1518 /*
1519  * Set of default routines for initialising struct proto_ops when
1520  * the protocol does not support a particular function. In certain
1521  * cases where it makes no sense for a protocol to have a "do nothing"
1522  * function, some default processing is provided.
1523  */
1524 
/* Default bind handler for protocols without bind support: -EOPNOTSUPP. */
1525 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1526 {
1527 	return -EOPNOTSUPP;
1528 }
1529 
/* Default connect handler for protocols without connect support: -EOPNOTSUPP. */
1530 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1531 		    int len, int flags)
1532 {
1533 	return -EOPNOTSUPP;
1534 }
1535 
/* Default socketpair handler for protocols without support: -EOPNOTSUPP. */
1536 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1537 {
1538 	return -EOPNOTSUPP;
1539 }
1540 
/* Default accept handler for protocols without accept support: -EOPNOTSUPP. */
1541 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1542 {
1543 	return -EOPNOTSUPP;
1544 }
1545 
/* Default getname handler for protocols without support: -EOPNOTSUPP. */
1546 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1547 		    int *len, int peer)
1548 {
1549 	return -EOPNOTSUPP;
1550 }
1551 
/* Default poll handler: reports an empty event mask (0), not an error. */
1552 unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
1553 {
1554 	return 0;
1555 }
1556 
/* Default ioctl handler for protocols without ioctl support: -EOPNOTSUPP. */
1557 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1558 {
1559 	return -EOPNOTSUPP;
1560 }
1561 
/* Default listen handler for protocols without listen support: -EOPNOTSUPP. */
1562 int sock_no_listen(struct socket *sock, int backlog)
1563 {
1564 	return -EOPNOTSUPP;
1565 }
1566 
/* Default shutdown handler for protocols without support: -EOPNOTSUPP. */
1567 int sock_no_shutdown(struct socket *sock, int how)
1568 {
1569 	return -EOPNOTSUPP;
1570 }
1571 
/* Default setsockopt handler for protocols without support: -EOPNOTSUPP. */
1572 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1573 		    char __user *optval, int optlen)
1574 {
1575 	return -EOPNOTSUPP;
1576 }
1577 
/* Default getsockopt handler for protocols without support: -EOPNOTSUPP. */
1578 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1579 		    char __user *optval, int __user *optlen)
1580 {
1581 	return -EOPNOTSUPP;
1582 }
1583 
/* Default sendmsg handler for protocols without sendmsg support: -EOPNOTSUPP. */
1584 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1585 		    size_t len)
1586 {
1587 	return -EOPNOTSUPP;
1588 }
1589 
/* Default recvmsg handler for protocols without recvmsg support: -EOPNOTSUPP. */
1590 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1591 		    size_t len, int flags)
1592 {
1593 	return -EOPNOTSUPP;
1594 }
1595 
/*
 * Default mmap handler for protocols without mmap support.  Unlike the
 * other stubs it returns -ENODEV rather than -EOPNOTSUPP.
 */
1596 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1597 {
1598 	/* Mirror missing mmap method error code */
1599 	return -ENODEV;
1600 }
1601 
1602 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1603 {
1604 	ssize_t res;
1605 	struct msghdr msg = {.msg_flags = flags};
1606 	struct kvec iov;
1607 	char *kaddr = kmap(page);
1608 	iov.iov_base = kaddr + offset;
1609 	iov.iov_len = size;
1610 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1611 	kunmap(page);
1612 	return res;
1613 }
1614 
1615 /*
1616  *	Default Socket Callbacks
1617  */
1618 
1619 static void sock_def_wakeup(struct sock *sk)
1620 {
1621 	read_lock(&sk->sk_callback_lock);
1622 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1623 		wake_up_interruptible_all(sk->sk_sleep);
1624 	read_unlock(&sk->sk_callback_lock);
1625 }
1626 
1627 static void sock_def_error_report(struct sock *sk)
1628 {
1629 	read_lock(&sk->sk_callback_lock);
1630 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1631 		wake_up_interruptible(sk->sk_sleep);
1632 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1633 	read_unlock(&sk->sk_callback_lock);
1634 }
1635 
1636 static void sock_def_readable(struct sock *sk, int len)
1637 {
1638 	read_lock(&sk->sk_callback_lock);
1639 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1640 		wake_up_interruptible_sync(sk->sk_sleep);
1641 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1642 	read_unlock(&sk->sk_callback_lock);
1643 }
1644 
1645 static void sock_def_write_space(struct sock *sk)
1646 {
1647 	read_lock(&sk->sk_callback_lock);
1648 
1649 	/* Do not wake up a writer until he can make "significant"
1650 	 * progress.  --DaveM
1651 	 */
1652 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1653 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
1654 			wake_up_interruptible_sync(sk->sk_sleep);
1655 
1656 		/* Should agree with poll, otherwise some programs break */
1657 		if (sock_writeable(sk))
1658 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1659 	}
1660 
1661 	read_unlock(&sk->sk_callback_lock);
1662 }
1663 
1664 static void sock_def_destruct(struct sock *sk)
1665 {
1666 	kfree(sk->sk_protinfo);
1667 }
1668 
1669 void sk_send_sigurg(struct sock *sk)
1670 {
1671 	if (sk->sk_socket && sk->sk_socket->file)
1672 		if (send_sigurg(&sk->sk_socket->file->f_owner))
1673 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1674 }
1675 
/*
 * sk_reset_timer - (re)arm a socket timer
 * @sk:      socket the timer belongs to
 * @timer:   timer to modify
 * @expires: new expiry time
 *
 * A socket reference is taken only when mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}

EXPORT_SYMBOL(sk_reset_timer);
1684 
/*
 * sk_stop_timer - cancel a socket timer
 *
 * If the timer is pending and del_timer() reports that it removed it, one
 * socket reference is dropped with __sock_put().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	if (!timer_pending(timer))
		return;

	if (del_timer(timer))
		__sock_put(sk);
}

EXPORT_SYMBOL(sk_stop_timer);
1692 
1693 void sock_init_data(struct socket *sock, struct sock *sk)
1694 {
1695 	skb_queue_head_init(&sk->sk_receive_queue);
1696 	skb_queue_head_init(&sk->sk_write_queue);
1697 	skb_queue_head_init(&sk->sk_error_queue);
1698 #ifdef CONFIG_NET_DMA
1699 	skb_queue_head_init(&sk->sk_async_wait_queue);
1700 #endif
1701 
1702 	sk->sk_send_head	=	NULL;
1703 
1704 	init_timer(&sk->sk_timer);
1705 
1706 	sk->sk_allocation	=	GFP_KERNEL;
1707 	sk->sk_rcvbuf		=	sysctl_rmem_default;
1708 	sk->sk_sndbuf		=	sysctl_wmem_default;
1709 	sk->sk_state		=	TCP_CLOSE;
1710 	sk_set_socket(sk, sock);
1711 
1712 	sock_set_flag(sk, SOCK_ZAPPED);
1713 
1714 	if (sock) {
1715 		sk->sk_type	=	sock->type;
1716 		sk->sk_sleep	=	&sock->wait;
1717 		sock->sk	=	sk;
1718 	} else
1719 		sk->sk_sleep	=	NULL;
1720 
1721 	rwlock_init(&sk->sk_dst_lock);
1722 	rwlock_init(&sk->sk_callback_lock);
1723 	lockdep_set_class_and_name(&sk->sk_callback_lock,
1724 			af_callback_keys + sk->sk_family,
1725 			af_family_clock_key_strings[sk->sk_family]);
1726 
1727 	sk->sk_state_change	=	sock_def_wakeup;
1728 	sk->sk_data_ready	=	sock_def_readable;
1729 	sk->sk_write_space	=	sock_def_write_space;
1730 	sk->sk_error_report	=	sock_def_error_report;
1731 	sk->sk_destruct		=	sock_def_destruct;
1732 
1733 	sk->sk_sndmsg_page	=	NULL;
1734 	sk->sk_sndmsg_off	=	0;
1735 
1736 	sk->sk_peercred.pid 	=	0;
1737 	sk->sk_peercred.uid	=	-1;
1738 	sk->sk_peercred.gid	=	-1;
1739 	sk->sk_write_pending	=	0;
1740 	sk->sk_rcvlowat		=	1;
1741 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
1742 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
1743 
1744 	sk->sk_stamp = ktime_set(-1L, 0);
1745 
1746 	atomic_set(&sk->sk_refcnt, 1);
1747 	atomic_set(&sk->sk_drops, 0);
1748 }
1749 
1750 void lock_sock_nested(struct sock *sk, int subclass)
1751 {
1752 	might_sleep();
1753 	spin_lock_bh(&sk->sk_lock.slock);
1754 	if (sk->sk_lock.owned)
1755 		__lock_sock(sk);
1756 	sk->sk_lock.owned = 1;
1757 	spin_unlock(&sk->sk_lock.slock);
1758 	/*
1759 	 * The sk_lock has mutex_lock() semantics here:
1760 	 */
1761 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1762 	local_bh_enable();
1763 }
1764 
1765 EXPORT_SYMBOL(lock_sock_nested);
1766 
1767 void release_sock(struct sock *sk)
1768 {
1769 	/*
1770 	 * The sk_lock has mutex_unlock() semantics:
1771 	 */
1772 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1773 
1774 	spin_lock_bh(&sk->sk_lock.slock);
1775 	if (sk->sk_backlog.tail)
1776 		__release_sock(sk);
1777 	sk->sk_lock.owned = 0;
1778 	if (waitqueue_active(&sk->sk_lock.wq))
1779 		wake_up(&sk->sk_lock.wq);
1780 	spin_unlock_bh(&sk->sk_lock.slock);
1781 }
1782 EXPORT_SYMBOL(release_sock);
1783 
1784 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1785 {
1786 	struct timeval tv;
1787 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1788 		sock_enable_timestamp(sk);
1789 	tv = ktime_to_timeval(sk->sk_stamp);
1790 	if (tv.tv_sec == -1)
1791 		return -ENOENT;
1792 	if (tv.tv_sec == 0) {
1793 		sk->sk_stamp = ktime_get_real();
1794 		tv = ktime_to_timeval(sk->sk_stamp);
1795 	}
1796 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1797 }
1798 EXPORT_SYMBOL(sock_get_timestamp);
1799 
1800 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1801 {
1802 	struct timespec ts;
1803 	if (!sock_flag(sk, SOCK_TIMESTAMP))
1804 		sock_enable_timestamp(sk);
1805 	ts = ktime_to_timespec(sk->sk_stamp);
1806 	if (ts.tv_sec == -1)
1807 		return -ENOENT;
1808 	if (ts.tv_sec == 0) {
1809 		sk->sk_stamp = ktime_get_real();
1810 		ts = ktime_to_timespec(sk->sk_stamp);
1811 	}
1812 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1813 }
1814 EXPORT_SYMBOL(sock_get_timestampns);
1815 
1816 void sock_enable_timestamp(struct sock *sk)
1817 {
1818 	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
1819 		sock_set_flag(sk, SOCK_TIMESTAMP);
1820 		net_enable_timestamp();
1821 	}
1822 }
1823 
1824 /*
1825  *	Get a socket option on an socket.
1826  *
1827  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
1828  *	asynchronous errors should be reported by getsockopt. We assume
1829  *	this means if you specify SO_ERROR (otherwise whats the point of it).
1830  */
1831 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1832 			   char __user *optval, int __user *optlen)
1833 {
1834 	struct sock *sk = sock->sk;
1835 
1836 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1837 }
1838 
1839 EXPORT_SYMBOL(sock_common_getsockopt);
1840 
1841 #ifdef CONFIG_COMPAT
1842 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
1843 				  char __user *optval, int __user *optlen)
1844 {
1845 	struct sock *sk = sock->sk;
1846 
1847 	if (sk->sk_prot->compat_getsockopt != NULL)
1848 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
1849 						      optval, optlen);
1850 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
1851 }
1852 EXPORT_SYMBOL(compat_sock_common_getsockopt);
1853 #endif
1854 
1855 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
1856 			struct msghdr *msg, size_t size, int flags)
1857 {
1858 	struct sock *sk = sock->sk;
1859 	int addr_len = 0;
1860 	int err;
1861 
1862 	err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
1863 				   flags & ~MSG_DONTWAIT, &addr_len);
1864 	if (err >= 0)
1865 		msg->msg_namelen = addr_len;
1866 	return err;
1867 }
1868 
1869 EXPORT_SYMBOL(sock_common_recvmsg);
1870 
1871 /*
1872  *	Set socket options on an inet socket.
1873  */
1874 int sock_common_setsockopt(struct socket *sock, int level, int optname,
1875 			   char __user *optval, int optlen)
1876 {
1877 	struct sock *sk = sock->sk;
1878 
1879 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1880 }
1881 
1882 EXPORT_SYMBOL(sock_common_setsockopt);
1883 
1884 #ifdef CONFIG_COMPAT
1885 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
1886 				  char __user *optval, int optlen)
1887 {
1888 	struct sock *sk = sock->sk;
1889 
1890 	if (sk->sk_prot->compat_setsockopt != NULL)
1891 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
1892 						      optval, optlen);
1893 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
1894 }
1895 EXPORT_SYMBOL(compat_sock_common_setsockopt);
1896 #endif
1897 
1898 void sk_common_release(struct sock *sk)
1899 {
1900 	if (sk->sk_prot->destroy)
1901 		sk->sk_prot->destroy(sk);
1902 
1903 	/*
1904 	 * Observation: when sock_common_release is called, processes have
1905 	 * no access to socket. But net still has.
1906 	 * Step one, detach it from networking:
1907 	 *
1908 	 * A. Remove from hash tables.
1909 	 */
1910 
1911 	sk->sk_prot->unhash(sk);
1912 
1913 	/*
1914 	 * In this point socket cannot receive new packets, but it is possible
1915 	 * that some packets are in flight because some CPU runs receiver and
1916 	 * did hash table lookup before we unhashed socket. They will achieve
1917 	 * receive queue and will be purged by socket destructor.
1918 	 *
1919 	 * Also we still have packets pending on receive queue and probably,
1920 	 * our own packets waiting in device queues. sock_destroy will drain
1921 	 * receive queue, but transmitted packets will delay socket destruction
1922 	 * until the last reference will be released.
1923 	 */
1924 
1925 	sock_orphan(sk);
1926 
1927 	xfrm_sk_free_policy(sk);
1928 
1929 	sk_refcnt_debug_release(sk);
1930 	sock_put(sk);
1931 }
1932 
1933 EXPORT_SYMBOL(sk_common_release);
1934 
/*
 * List of registered protocols.  proto_register()/proto_unregister() modify
 * it under the write lock; the /proc seq iterator walks it under the read
 * lock.
 */
1935 static DEFINE_RWLOCK(proto_list_lock);
1936 static LIST_HEAD(proto_list);
1937 
1938 #ifdef CONFIG_PROC_FS
/* Number of per-protocol in-use counter slots. */
1939 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
/* One set of in-use counters, indexed by proto->inuse_idx. */
1940 struct prot_inuse {
1941 	int val[PROTO_INUSE_NR];
1942 };
1943 
/* Bitmap of counter slots currently assigned to a registered protocol. */
1944 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
1945 
1946 #ifdef CONFIG_NET_NS
1947 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1948 {
1949 	int cpu = smp_processor_id();
1950 	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
1951 }
1952 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1953 
1954 int sock_prot_inuse_get(struct net *net, struct proto *prot)
1955 {
1956 	int cpu, idx = prot->inuse_idx;
1957 	int res = 0;
1958 
1959 	for_each_possible_cpu(cpu)
1960 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
1961 
1962 	return res >= 0 ? res : 0;
1963 }
1964 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
1965 
1966 static int sock_inuse_init_net(struct net *net)
1967 {
1968 	net->core.inuse = alloc_percpu(struct prot_inuse);
1969 	return net->core.inuse ? 0 : -ENOMEM;
1970 }
1971 
1972 static void sock_inuse_exit_net(struct net *net)
1973 {
1974 	free_percpu(net->core.inuse);
1975 }
1976 
/* Per-netns hooks that allocate and free the in-use counters. */
1977 static struct pernet_operations net_inuse_ops = {
1978 	.init = sock_inuse_init_net,
1979 	.exit = sock_inuse_exit_net,
1980 };
1981 
1982 static __init int net_inuse_init(void)
1983 {
1984 	if (register_pernet_subsys(&net_inuse_ops))
1985 		panic("Cannot initialize net inuse counters");
1986 
1987 	return 0;
1988 }
1989 
1990 core_initcall(net_inuse_init);
1991 #else
1992 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
1993 
1994 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
1995 {
1996 	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
1997 }
1998 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
1999 
2000 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2001 {
2002 	int cpu, idx = prot->inuse_idx;
2003 	int res = 0;
2004 
2005 	for_each_possible_cpu(cpu)
2006 		res += per_cpu(prot_inuse, cpu).val[idx];
2007 
2008 	return res >= 0 ? res : 0;
2009 }
2010 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2011 #endif
2012 
2013 static void assign_proto_idx(struct proto *prot)
2014 {
2015 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2016 
2017 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2018 		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2019 		return;
2020 	}
2021 
2022 	set_bit(prot->inuse_idx, proto_inuse_idx);
2023 }
2024 
2025 static void release_proto_idx(struct proto *prot)
2026 {
2027 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2028 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2029 }
2030 #else
/* No-op variant used when CONFIG_PROC_FS is not set. */
2031 static inline void assign_proto_idx(struct proto *prot)
2032 {
2033 }
2034 
/* No-op variant used when CONFIG_PROC_FS is not set. */
2035 static inline void release_proto_idx(struct proto *prot)
2036 {
2037 }
2038 #endif
2039 
2040 int proto_register(struct proto *prot, int alloc_slab)
2041 {
2042 	if (alloc_slab) {
2043 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2044 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2045 					NULL);
2046 
2047 		if (prot->slab == NULL) {
2048 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2049 			       prot->name);
2050 			goto out;
2051 		}
2052 
2053 		if (prot->rsk_prot != NULL) {
2054 			static const char mask[] = "request_sock_%s";
2055 
2056 			prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2057 			if (prot->rsk_prot->slab_name == NULL)
2058 				goto out_free_sock_slab;
2059 
2060 			sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2061 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2062 								 prot->rsk_prot->obj_size, 0,
2063 								 SLAB_HWCACHE_ALIGN, NULL);
2064 
2065 			if (prot->rsk_prot->slab == NULL) {
2066 				printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2067 				       prot->name);
2068 				goto out_free_request_sock_slab_name;
2069 			}
2070 		}
2071 
2072 		if (prot->twsk_prot != NULL) {
2073 			static const char mask[] = "tw_sock_%s";
2074 
2075 			prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2076 
2077 			if (prot->twsk_prot->twsk_slab_name == NULL)
2078 				goto out_free_request_sock_slab;
2079 
2080 			sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2081 			prot->twsk_prot->twsk_slab =
2082 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2083 						  prot->twsk_prot->twsk_obj_size,
2084 						  0,
2085 						  SLAB_HWCACHE_ALIGN |
2086 							prot->slab_flags,
2087 						  NULL);
2088 			if (prot->twsk_prot->twsk_slab == NULL)
2089 				goto out_free_timewait_sock_slab_name;
2090 		}
2091 	}
2092 
2093 	write_lock(&proto_list_lock);
2094 	list_add(&prot->node, &proto_list);
2095 	assign_proto_idx(prot);
2096 	write_unlock(&proto_list_lock);
2097 	return 0;
2098 
2099 out_free_timewait_sock_slab_name:
2100 	kfree(prot->twsk_prot->twsk_slab_name);
2101 out_free_request_sock_slab:
2102 	if (prot->rsk_prot && prot->rsk_prot->slab) {
2103 		kmem_cache_destroy(prot->rsk_prot->slab);
2104 		prot->rsk_prot->slab = NULL;
2105 	}
2106 out_free_request_sock_slab_name:
2107 	kfree(prot->rsk_prot->slab_name);
2108 out_free_sock_slab:
2109 	kmem_cache_destroy(prot->slab);
2110 	prot->slab = NULL;
2111 out:
2112 	return -ENOBUFS;
2113 }
2114 
2115 EXPORT_SYMBOL(proto_register);
2116 
2117 void proto_unregister(struct proto *prot)
2118 {
2119 	write_lock(&proto_list_lock);
2120 	release_proto_idx(prot);
2121 	list_del(&prot->node);
2122 	write_unlock(&proto_list_lock);
2123 
2124 	if (prot->slab != NULL) {
2125 		kmem_cache_destroy(prot->slab);
2126 		prot->slab = NULL;
2127 	}
2128 
2129 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2130 		kmem_cache_destroy(prot->rsk_prot->slab);
2131 		kfree(prot->rsk_prot->slab_name);
2132 		prot->rsk_prot->slab = NULL;
2133 	}
2134 
2135 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2136 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2137 		kfree(prot->twsk_prot->twsk_slab_name);
2138 		prot->twsk_prot->twsk_slab = NULL;
2139 	}
2140 }
2141 
2142 EXPORT_SYMBOL(proto_unregister);
2143 
2144 #ifdef CONFIG_PROC_FS
2145 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2146 	__acquires(proto_list_lock)
2147 {
2148 	read_lock(&proto_list_lock);
2149 	return seq_list_start_head(&proto_list, *pos);
2150 }
2151 
2152 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2153 {
2154 	return seq_list_next(v, &proto_list, pos);
2155 }
2156 
2157 static void proto_seq_stop(struct seq_file *seq, void *v)
2158 	__releases(proto_list_lock)
2159 {
2160 	read_unlock(&proto_list_lock);
2161 }
2162 
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the /proc table. */
static char proto_method_implemented(const void *method)
{
	if (method != NULL)
		return 'y';

	return 'n';
}
2167 
/*
 * Emit one table row for @proto: name, object size, in-use socket count,
 * memory_allocated (-1 when the protocol has no such counter), memory
 * pressure ("yes"/"no", or "NI" when the protocol has no pressure flag),
 * max header, whether a slab cache exists, owning module, and one y/n
 * column per method listed in the header printed by proto_seq_show().
 */
2168 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2169 {
2170 	seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2171 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2172 		   proto->name,
2173 		   proto->obj_size,
2174 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2175 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2176 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2177 		   proto->max_header,
2178 		   proto->slab == NULL ? "no" : "yes",
2179 		   module_name(proto->owner),
		   /* Method columns, in the same order as the header row. */
2180 		   proto_method_implemented(proto->close),
2181 		   proto_method_implemented(proto->connect),
2182 		   proto_method_implemented(proto->disconnect),
2183 		   proto_method_implemented(proto->accept),
2184 		   proto_method_implemented(proto->ioctl),
2185 		   proto_method_implemented(proto->init),
2186 		   proto_method_implemented(proto->destroy),
2187 		   proto_method_implemented(proto->shutdown),
2188 		   proto_method_implemented(proto->setsockopt),
2189 		   proto_method_implemented(proto->getsockopt),
2190 		   proto_method_implemented(proto->sendmsg),
2191 		   proto_method_implemented(proto->recvmsg),
2192 		   proto_method_implemented(proto->sendpage),
2193 		   proto_method_implemented(proto->bind),
2194 		   proto_method_implemented(proto->backlog_rcv),
2195 		   proto_method_implemented(proto->hash),
2196 		   proto_method_implemented(proto->unhash),
2197 		   proto_method_implemented(proto->get_port),
2198 		   proto_method_implemented(proto->enter_memory_pressure));
2199 }
2200 
2201 static int proto_seq_show(struct seq_file *seq, void *v)
2202 {
2203 	if (v == &proto_list)
2204 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2205 			   "protocol",
2206 			   "size",
2207 			   "sockets",
2208 			   "memory",
2209 			   "press",
2210 			   "maxhdr",
2211 			   "slab",
2212 			   "module",
2213 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2214 	else
2215 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2216 	return 0;
2217 }
2218 
2219 static const struct seq_operations proto_seq_ops = {
2220 	.start  = proto_seq_start,
2221 	.next   = proto_seq_next,
2222 	.stop   = proto_seq_stop,
2223 	.show   = proto_seq_show,
2224 };
2225 
2226 static int proto_seq_open(struct inode *inode, struct file *file)
2227 {
2228 	return seq_open_net(inode, file, &proto_seq_ops,
2229 			    sizeof(struct seq_net_private));
2230 }
2231 
2232 static const struct file_operations proto_seq_fops = {
2233 	.owner		= THIS_MODULE,
2234 	.open		= proto_seq_open,
2235 	.read		= seq_read,
2236 	.llseek		= seq_lseek,
2237 	.release	= seq_release_net,
2238 };
2239 
2240 static __net_init int proto_init_net(struct net *net)
2241 {
2242 	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2243 		return -ENOMEM;
2244 
2245 	return 0;
2246 }
2247 
2248 static __net_exit void proto_exit_net(struct net *net)
2249 {
2250 	proc_net_remove(net, "protocols");
2251 }
2252 
2253 
2254 static __net_initdata struct pernet_operations proto_net_ops = {
2255 	.init = proto_init_net,
2256 	.exit = proto_exit_net,
2257 };
2258 
2259 static int __init proto_init(void)
2260 {
2261 	return register_pernet_subsys(&proto_net_ops);
2262 }
2263 
2264 subsys_initcall(proto_init);
2265 
2266 #endif /* PROC_FS */
2267 
/*
 * Symbols exported from this file.  Their definitions are outside this
 * part of the file; the grouping below is by name only.
 */

/* sk_* helpers and generic sock_* entry points. */
EXPORT_SYMBOL(sk_alloc);
EXPORT_SYMBOL(sk_free);
EXPORT_SYMBOL(sk_send_sigurg);
EXPORT_SYMBOL(sock_alloc_send_skb);
EXPORT_SYMBOL(sock_init_data);
EXPORT_SYMBOL(sock_kfree_s);
EXPORT_SYMBOL(sock_kmalloc);

/* The sock_no_*() family. */
EXPORT_SYMBOL(sock_no_accept);
EXPORT_SYMBOL(sock_no_bind);
EXPORT_SYMBOL(sock_no_connect);
EXPORT_SYMBOL(sock_no_getname);
EXPORT_SYMBOL(sock_no_getsockopt);
EXPORT_SYMBOL(sock_no_ioctl);
EXPORT_SYMBOL(sock_no_listen);
EXPORT_SYMBOL(sock_no_mmap);
EXPORT_SYMBOL(sock_no_poll);
EXPORT_SYMBOL(sock_no_recvmsg);
EXPORT_SYMBOL(sock_no_sendmsg);
EXPORT_SYMBOL(sock_no_sendpage);
EXPORT_SYMBOL(sock_no_setsockopt);
EXPORT_SYMBOL(sock_no_shutdown);
EXPORT_SYMBOL(sock_no_socketpair);

/* Remaining sock_* helpers and the optmem sysctl variable. */
EXPORT_SYMBOL(sock_rfree);
EXPORT_SYMBOL(sock_setsockopt);
EXPORT_SYMBOL(sock_wfree);
EXPORT_SYMBOL(sock_wmalloc);
EXPORT_SYMBOL(sock_i_uid);
EXPORT_SYMBOL(sock_i_ino);
EXPORT_SYMBOL(sysctl_optmem_max);
2297