xref: /linux/net/core/sock.c (revision a8fe58cec351c25e09c393bf46117c0c47b5a17c)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 #include <linux/sock_diag.h>
135 
136 #include <linux/filter.h>
137 #include <net/sock_reuseport.h>
138 
139 #include <trace/events/sock.h>
140 
141 #ifdef CONFIG_INET
142 #include <net/tcp.h>
143 #endif
144 
145 #include <net/busy_poll.h>
146 
/*
 * File-local list head and mutex.
 * NOTE(review): presumably the list of registered protocols and the lock
 * that serializes access to it -- confirm against the users of these two.
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);
149 
150 /**
151  * sk_ns_capable - General socket capability test
152  * @sk: Socket to use a capability on or through
153  * @user_ns: The user namespace of the capability to use
154  * @cap: The capability to use
155  *
156  * Test to see if the opener of the socket had when the socket was
157  * created and the current process has the capability @cap in the user
158  * namespace @user_ns.
159  */
160 bool sk_ns_capable(const struct sock *sk,
161 		   struct user_namespace *user_ns, int cap)
162 {
163 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
164 		ns_capable(user_ns, cap);
165 }
166 EXPORT_SYMBOL(sk_ns_capable);
167 
168 /**
169  * sk_capable - Socket global capability test
170  * @sk: Socket to use a capability on or through
171  * @cap: The global capability to use
172  *
173  * Test to see if the opener of the socket had when the socket was
174  * created and the current process has the capability @cap in all user
175  * namespaces.
176  */
177 bool sk_capable(const struct sock *sk, int cap)
178 {
179 	return sk_ns_capable(sk, &init_user_ns, cap);
180 }
181 EXPORT_SYMBOL(sk_capable);
182 
183 /**
184  * sk_net_capable - Network namespace socket capability test
185  * @sk: Socket to use a capability on or through
186  * @cap: The capability to use
187  *
188  * Test to see if the opener of the socket had when the socket was created
189  * and the current process has the capability @cap over the network namespace
190  * the socket is a member of.
191  */
192 bool sk_net_capable(const struct sock *sk, int cap)
193 {
194 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
195 }
196 EXPORT_SYMBOL(sk_net_capable);
197 
/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family. Two parallel tables are kept, each
 * with AF_MAX entries: af_family_keys and af_family_slock_keys.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
204 
/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 *
 * The three tables below are parallel: entry N of each table names the
 * lock class for address family number N, and the final entry stands
 * for AF_MAX itself.  Families without a symbolic name in this list are
 * shown by number (27, 28).  Every named entry carries the "AF_" prefix
 * so that the same family reads the same way in all three tables.
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
};
258 
/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key
 * (one entry per address family, AF_MAX entries in total):
 */
static struct lock_class_key af_callback_keys[AF_MAX];
264 
/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 *
 * Both limits work out to _SK_MEM_PACKETS buffers of SKB_TRUESIZE(256)
 * bytes each.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters.
 * sysctl_wmem_max / sysctl_rmem_max cap the value accepted by the
 * SO_SNDBUF / SO_RCVBUF cases of sock_setsockopt(); the *_default
 * variables start out at the same compile-time maximum.
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

/* Static key incremented by sk_set_memalloc() and decremented by
 * sk_clear_memalloc(); initially false.
 */
struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);
291 
292 /**
293  * sk_set_memalloc - sets %SOCK_MEMALLOC
294  * @sk: socket to set it on
295  *
296  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297  * It's the responsibility of the admin to adjust min_free_kbytes
298  * to meet the requirements
299  */
300 void sk_set_memalloc(struct sock *sk)
301 {
302 	sock_set_flag(sk, SOCK_MEMALLOC);
303 	sk->sk_allocation |= __GFP_MEMALLOC;
304 	static_key_slow_inc(&memalloc_socks);
305 }
306 EXPORT_SYMBOL_GPL(sk_set_memalloc);
307 
308 void sk_clear_memalloc(struct sock *sk)
309 {
310 	sock_reset_flag(sk, SOCK_MEMALLOC);
311 	sk->sk_allocation &= ~__GFP_MEMALLOC;
312 	static_key_slow_dec(&memalloc_socks);
313 
314 	/*
315 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 	 * it has rmem allocations due to the last swapfile being deactivated
318 	 * but there is a risk that the socket is unusable due to exceeding
319 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
320 	 */
321 	sk_mem_reclaim(sk);
322 }
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
324 
325 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
326 {
327 	int ret;
328 	unsigned long pflags = current->flags;
329 
330 	/* these should have been dropped before queueing */
331 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
332 
333 	current->flags |= PF_MEMALLOC;
334 	ret = sk->sk_backlog_rcv(sk, skb);
335 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
336 
337 	return ret;
338 }
339 EXPORT_SYMBOL(__sk_backlog_rcv);
340 
341 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
342 {
343 	struct timeval tv;
344 
345 	if (optlen < sizeof(tv))
346 		return -EINVAL;
347 	if (copy_from_user(&tv, optval, sizeof(tv)))
348 		return -EFAULT;
349 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
350 		return -EDOM;
351 
352 	if (tv.tv_sec < 0) {
353 		static int warned __read_mostly;
354 
355 		*timeo_p = 0;
356 		if (warned < 10 && net_ratelimit()) {
357 			warned++;
358 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
359 				__func__, current->comm, task_pid_nr(current));
360 		}
361 		return 0;
362 	}
363 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
364 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
365 		return 0;
366 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
367 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
368 	return 0;
369 }
370 
371 static void sock_warn_obsolete_bsdism(const char *name)
372 {
373 	static int warned;
374 	static char warncomm[TASK_COMM_LEN];
375 	if (strcmp(warncomm, current->comm) && warned < 5) {
376 		strcpy(warncomm,  current->comm);
377 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
378 			warncomm, name);
379 		warned++;
380 	}
381 }
382 
383 static bool sock_needs_netstamp(const struct sock *sk)
384 {
385 	switch (sk->sk_family) {
386 	case AF_UNSPEC:
387 	case AF_UNIX:
388 		return false;
389 	default:
390 		return true;
391 	}
392 }
393 
394 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
395 {
396 	if (sk->sk_flags & flags) {
397 		sk->sk_flags &= ~flags;
398 		if (sock_needs_netstamp(sk) &&
399 		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
400 			net_disable_timestamp();
401 	}
402 }
403 
404 
405 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
406 {
407 	int err;
408 	unsigned long flags;
409 	struct sk_buff_head *list = &sk->sk_receive_queue;
410 
411 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
412 		atomic_inc(&sk->sk_drops);
413 		trace_sock_rcvqueue_full(sk, skb);
414 		return -ENOMEM;
415 	}
416 
417 	err = sk_filter(sk, skb);
418 	if (err)
419 		return err;
420 
421 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
422 		atomic_inc(&sk->sk_drops);
423 		return -ENOBUFS;
424 	}
425 
426 	skb->dev = NULL;
427 	skb_set_owner_r(skb, sk);
428 
429 	/* we escape from rcu protected region, make sure we dont leak
430 	 * a norefcounted dst
431 	 */
432 	skb_dst_force(skb);
433 
434 	spin_lock_irqsave(&list->lock, flags);
435 	sock_skb_set_dropcount(sk, skb);
436 	__skb_queue_tail(list, skb);
437 	spin_unlock_irqrestore(&list->lock, flags);
438 
439 	if (!sock_flag(sk, SOCK_DEAD))
440 		sk->sk_data_ready(sk);
441 	return 0;
442 }
443 EXPORT_SYMBOL(sock_queue_rcv_skb);
444 
445 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
446 {
447 	int rc = NET_RX_SUCCESS;
448 
449 	if (sk_filter(sk, skb))
450 		goto discard_and_relse;
451 
452 	skb->dev = NULL;
453 
454 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
455 		atomic_inc(&sk->sk_drops);
456 		goto discard_and_relse;
457 	}
458 	if (nested)
459 		bh_lock_sock_nested(sk);
460 	else
461 		bh_lock_sock(sk);
462 	if (!sock_owned_by_user(sk)) {
463 		/*
464 		 * trylock + unlock semantics:
465 		 */
466 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
467 
468 		rc = sk_backlog_rcv(sk, skb);
469 
470 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
471 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
472 		bh_unlock_sock(sk);
473 		atomic_inc(&sk->sk_drops);
474 		goto discard_and_relse;
475 	}
476 
477 	bh_unlock_sock(sk);
478 out:
479 	sock_put(sk);
480 	return rc;
481 discard_and_relse:
482 	kfree_skb(skb);
483 	goto out;
484 }
485 EXPORT_SYMBOL(sk_receive_skb);
486 
487 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
488 {
489 	struct dst_entry *dst = __sk_dst_get(sk);
490 
491 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
492 		sk_tx_queue_clear(sk);
493 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
494 		dst_release(dst);
495 		return NULL;
496 	}
497 
498 	return dst;
499 }
500 EXPORT_SYMBOL(__sk_dst_check);
501 
502 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
503 {
504 	struct dst_entry *dst = sk_dst_get(sk);
505 
506 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
507 		sk_dst_reset(sk);
508 		dst_release(dst);
509 		return NULL;
510 	}
511 
512 	return dst;
513 }
514 EXPORT_SYMBOL(sk_dst_check);
515 
516 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
517 				int optlen)
518 {
519 	int ret = -ENOPROTOOPT;
520 #ifdef CONFIG_NETDEVICES
521 	struct net *net = sock_net(sk);
522 	char devname[IFNAMSIZ];
523 	int index;
524 
525 	/* Sorry... */
526 	ret = -EPERM;
527 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
528 		goto out;
529 
530 	ret = -EINVAL;
531 	if (optlen < 0)
532 		goto out;
533 
534 	/* Bind this socket to a particular device like "eth0",
535 	 * as specified in the passed interface name. If the
536 	 * name is "" or the option length is zero the socket
537 	 * is not bound.
538 	 */
539 	if (optlen > IFNAMSIZ - 1)
540 		optlen = IFNAMSIZ - 1;
541 	memset(devname, 0, sizeof(devname));
542 
543 	ret = -EFAULT;
544 	if (copy_from_user(devname, optval, optlen))
545 		goto out;
546 
547 	index = 0;
548 	if (devname[0] != '\0') {
549 		struct net_device *dev;
550 
551 		rcu_read_lock();
552 		dev = dev_get_by_name_rcu(net, devname);
553 		if (dev)
554 			index = dev->ifindex;
555 		rcu_read_unlock();
556 		ret = -ENODEV;
557 		if (!dev)
558 			goto out;
559 	}
560 
561 	lock_sock(sk);
562 	sk->sk_bound_dev_if = index;
563 	sk_dst_reset(sk);
564 	release_sock(sk);
565 
566 	ret = 0;
567 
568 out:
569 #endif
570 
571 	return ret;
572 }
573 
574 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
575 				int __user *optlen, int len)
576 {
577 	int ret = -ENOPROTOOPT;
578 #ifdef CONFIG_NETDEVICES
579 	struct net *net = sock_net(sk);
580 	char devname[IFNAMSIZ];
581 
582 	if (sk->sk_bound_dev_if == 0) {
583 		len = 0;
584 		goto zero;
585 	}
586 
587 	ret = -EINVAL;
588 	if (len < IFNAMSIZ)
589 		goto out;
590 
591 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
592 	if (ret)
593 		goto out;
594 
595 	len = strlen(devname) + 1;
596 
597 	ret = -EFAULT;
598 	if (copy_to_user(optval, devname, len))
599 		goto out;
600 
601 zero:
602 	ret = -EFAULT;
603 	if (put_user(len, optlen))
604 		goto out;
605 
606 	ret = 0;
607 
608 out:
609 #endif
610 
611 	return ret;
612 }
613 
/* Set socket flag @bit when @valbool is non-zero, clear it otherwise. */
static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (!valbool) {
		sock_reset_flag(sk, bit);
		return;
	}

	sock_set_flag(sk, bit);
}
621 
622 bool sk_mc_loop(struct sock *sk)
623 {
624 	if (dev_recursion_level())
625 		return false;
626 	if (!sk)
627 		return true;
628 	switch (sk->sk_family) {
629 	case AF_INET:
630 		return inet_sk(sk)->mc_loop;
631 #if IS_ENABLED(CONFIG_IPV6)
632 	case AF_INET6:
633 		return inet6_sk(sk)->mc_loop;
634 #endif
635 	}
636 	WARN_ON(1);
637 	return true;
638 }
639 EXPORT_SYMBOL(sk_mc_loop);
640 
641 /*
642  *	This is meant for all protocols to use and covers goings on
643  *	at the socket level. Everything here is generic.
644  */
645 
646 int sock_setsockopt(struct socket *sock, int level, int optname,
647 		    char __user *optval, unsigned int optlen)
648 {
649 	struct sock *sk = sock->sk;
650 	int val;
651 	int valbool;
652 	struct linger ling;
653 	int ret = 0;
654 
655 	/*
656 	 *	Options without arguments
657 	 */
658 
659 	if (optname == SO_BINDTODEVICE)
660 		return sock_setbindtodevice(sk, optval, optlen);
661 
662 	if (optlen < sizeof(int))
663 		return -EINVAL;
664 
665 	if (get_user(val, (int __user *)optval))
666 		return -EFAULT;
667 
668 	valbool = val ? 1 : 0;
669 
670 	lock_sock(sk);
671 
672 	switch (optname) {
673 	case SO_DEBUG:
674 		if (val && !capable(CAP_NET_ADMIN))
675 			ret = -EACCES;
676 		else
677 			sock_valbool_flag(sk, SOCK_DBG, valbool);
678 		break;
679 	case SO_REUSEADDR:
680 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
681 		break;
682 	case SO_REUSEPORT:
683 		sk->sk_reuseport = valbool;
684 		break;
685 	case SO_TYPE:
686 	case SO_PROTOCOL:
687 	case SO_DOMAIN:
688 	case SO_ERROR:
689 		ret = -ENOPROTOOPT;
690 		break;
691 	case SO_DONTROUTE:
692 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
693 		break;
694 	case SO_BROADCAST:
695 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
696 		break;
697 	case SO_SNDBUF:
698 		/* Don't error on this BSD doesn't and if you think
699 		 * about it this is right. Otherwise apps have to
700 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
701 		 * are treated in BSD as hints
702 		 */
703 		val = min_t(u32, val, sysctl_wmem_max);
704 set_sndbuf:
705 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
706 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
707 		/* Wake up sending tasks if we upped the value. */
708 		sk->sk_write_space(sk);
709 		break;
710 
711 	case SO_SNDBUFFORCE:
712 		if (!capable(CAP_NET_ADMIN)) {
713 			ret = -EPERM;
714 			break;
715 		}
716 		goto set_sndbuf;
717 
718 	case SO_RCVBUF:
719 		/* Don't error on this BSD doesn't and if you think
720 		 * about it this is right. Otherwise apps have to
721 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
722 		 * are treated in BSD as hints
723 		 */
724 		val = min_t(u32, val, sysctl_rmem_max);
725 set_rcvbuf:
726 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
727 		/*
728 		 * We double it on the way in to account for
729 		 * "struct sk_buff" etc. overhead.   Applications
730 		 * assume that the SO_RCVBUF setting they make will
731 		 * allow that much actual data to be received on that
732 		 * socket.
733 		 *
734 		 * Applications are unaware that "struct sk_buff" and
735 		 * other overheads allocate from the receive buffer
736 		 * during socket buffer allocation.
737 		 *
738 		 * And after considering the possible alternatives,
739 		 * returning the value we actually used in getsockopt
740 		 * is the most desirable behavior.
741 		 */
742 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
743 		break;
744 
745 	case SO_RCVBUFFORCE:
746 		if (!capable(CAP_NET_ADMIN)) {
747 			ret = -EPERM;
748 			break;
749 		}
750 		goto set_rcvbuf;
751 
752 	case SO_KEEPALIVE:
753 #ifdef CONFIG_INET
754 		if (sk->sk_protocol == IPPROTO_TCP &&
755 		    sk->sk_type == SOCK_STREAM)
756 			tcp_set_keepalive(sk, valbool);
757 #endif
758 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
759 		break;
760 
761 	case SO_OOBINLINE:
762 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
763 		break;
764 
765 	case SO_NO_CHECK:
766 		sk->sk_no_check_tx = valbool;
767 		break;
768 
769 	case SO_PRIORITY:
770 		if ((val >= 0 && val <= 6) ||
771 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
772 			sk->sk_priority = val;
773 		else
774 			ret = -EPERM;
775 		break;
776 
777 	case SO_LINGER:
778 		if (optlen < sizeof(ling)) {
779 			ret = -EINVAL;	/* 1003.1g */
780 			break;
781 		}
782 		if (copy_from_user(&ling, optval, sizeof(ling))) {
783 			ret = -EFAULT;
784 			break;
785 		}
786 		if (!ling.l_onoff)
787 			sock_reset_flag(sk, SOCK_LINGER);
788 		else {
789 #if (BITS_PER_LONG == 32)
790 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
791 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
792 			else
793 #endif
794 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
795 			sock_set_flag(sk, SOCK_LINGER);
796 		}
797 		break;
798 
799 	case SO_BSDCOMPAT:
800 		sock_warn_obsolete_bsdism("setsockopt");
801 		break;
802 
803 	case SO_PASSCRED:
804 		if (valbool)
805 			set_bit(SOCK_PASSCRED, &sock->flags);
806 		else
807 			clear_bit(SOCK_PASSCRED, &sock->flags);
808 		break;
809 
810 	case SO_TIMESTAMP:
811 	case SO_TIMESTAMPNS:
812 		if (valbool)  {
813 			if (optname == SO_TIMESTAMP)
814 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
815 			else
816 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
817 			sock_set_flag(sk, SOCK_RCVTSTAMP);
818 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
819 		} else {
820 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
821 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
822 		}
823 		break;
824 
825 	case SO_TIMESTAMPING:
826 		if (val & ~SOF_TIMESTAMPING_MASK) {
827 			ret = -EINVAL;
828 			break;
829 		}
830 
831 		if (val & SOF_TIMESTAMPING_OPT_ID &&
832 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
833 			if (sk->sk_protocol == IPPROTO_TCP &&
834 			    sk->sk_type == SOCK_STREAM) {
835 				if (sk->sk_state != TCP_ESTABLISHED) {
836 					ret = -EINVAL;
837 					break;
838 				}
839 				sk->sk_tskey = tcp_sk(sk)->snd_una;
840 			} else {
841 				sk->sk_tskey = 0;
842 			}
843 		}
844 		sk->sk_tsflags = val;
845 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
846 			sock_enable_timestamp(sk,
847 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
848 		else
849 			sock_disable_timestamp(sk,
850 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
851 		break;
852 
853 	case SO_RCVLOWAT:
854 		if (val < 0)
855 			val = INT_MAX;
856 		sk->sk_rcvlowat = val ? : 1;
857 		break;
858 
859 	case SO_RCVTIMEO:
860 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
861 		break;
862 
863 	case SO_SNDTIMEO:
864 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
865 		break;
866 
867 	case SO_ATTACH_FILTER:
868 		ret = -EINVAL;
869 		if (optlen == sizeof(struct sock_fprog)) {
870 			struct sock_fprog fprog;
871 
872 			ret = -EFAULT;
873 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
874 				break;
875 
876 			ret = sk_attach_filter(&fprog, sk);
877 		}
878 		break;
879 
880 	case SO_ATTACH_BPF:
881 		ret = -EINVAL;
882 		if (optlen == sizeof(u32)) {
883 			u32 ufd;
884 
885 			ret = -EFAULT;
886 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
887 				break;
888 
889 			ret = sk_attach_bpf(ufd, sk);
890 		}
891 		break;
892 
893 	case SO_ATTACH_REUSEPORT_CBPF:
894 		ret = -EINVAL;
895 		if (optlen == sizeof(struct sock_fprog)) {
896 			struct sock_fprog fprog;
897 
898 			ret = -EFAULT;
899 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
900 				break;
901 
902 			ret = sk_reuseport_attach_filter(&fprog, sk);
903 		}
904 		break;
905 
906 	case SO_ATTACH_REUSEPORT_EBPF:
907 		ret = -EINVAL;
908 		if (optlen == sizeof(u32)) {
909 			u32 ufd;
910 
911 			ret = -EFAULT;
912 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
913 				break;
914 
915 			ret = sk_reuseport_attach_bpf(ufd, sk);
916 		}
917 		break;
918 
919 	case SO_DETACH_FILTER:
920 		ret = sk_detach_filter(sk);
921 		break;
922 
923 	case SO_LOCK_FILTER:
924 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
925 			ret = -EPERM;
926 		else
927 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
928 		break;
929 
930 	case SO_PASSSEC:
931 		if (valbool)
932 			set_bit(SOCK_PASSSEC, &sock->flags);
933 		else
934 			clear_bit(SOCK_PASSSEC, &sock->flags);
935 		break;
936 	case SO_MARK:
937 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
938 			ret = -EPERM;
939 		else
940 			sk->sk_mark = val;
941 		break;
942 
943 	case SO_RXQ_OVFL:
944 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
945 		break;
946 
947 	case SO_WIFI_STATUS:
948 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
949 		break;
950 
951 	case SO_PEEK_OFF:
952 		if (sock->ops->set_peek_off)
953 			ret = sock->ops->set_peek_off(sk, val);
954 		else
955 			ret = -EOPNOTSUPP;
956 		break;
957 
958 	case SO_NOFCS:
959 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
960 		break;
961 
962 	case SO_SELECT_ERR_QUEUE:
963 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
964 		break;
965 
966 #ifdef CONFIG_NET_RX_BUSY_POLL
967 	case SO_BUSY_POLL:
968 		/* allow unprivileged users to decrease the value */
969 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
970 			ret = -EPERM;
971 		else {
972 			if (val < 0)
973 				ret = -EINVAL;
974 			else
975 				sk->sk_ll_usec = val;
976 		}
977 		break;
978 #endif
979 
980 	case SO_MAX_PACING_RATE:
981 		sk->sk_max_pacing_rate = val;
982 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
983 					 sk->sk_max_pacing_rate);
984 		break;
985 
986 	case SO_INCOMING_CPU:
987 		sk->sk_incoming_cpu = val;
988 		break;
989 
990 	default:
991 		ret = -ENOPROTOOPT;
992 		break;
993 	}
994 	release_sock(sk);
995 	return ret;
996 }
997 EXPORT_SYMBOL(sock_setsockopt);
998 
999 
1000 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001 			  struct ucred *ucred)
1002 {
1003 	ucred->pid = pid_vnr(pid);
1004 	ucred->uid = ucred->gid = -1;
1005 	if (cred) {
1006 		struct user_namespace *current_ns = current_user_ns();
1007 
1008 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1010 	}
1011 }
1012 
1013 int sock_getsockopt(struct socket *sock, int level, int optname,
1014 		    char __user *optval, int __user *optlen)
1015 {
1016 	struct sock *sk = sock->sk;
1017 
1018 	union {
1019 		int val;
1020 		struct linger ling;
1021 		struct timeval tm;
1022 	} v;
1023 
1024 	int lv = sizeof(int);
1025 	int len;
1026 
1027 	if (get_user(len, optlen))
1028 		return -EFAULT;
1029 	if (len < 0)
1030 		return -EINVAL;
1031 
1032 	memset(&v, 0, sizeof(v));
1033 
1034 	switch (optname) {
1035 	case SO_DEBUG:
1036 		v.val = sock_flag(sk, SOCK_DBG);
1037 		break;
1038 
1039 	case SO_DONTROUTE:
1040 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041 		break;
1042 
1043 	case SO_BROADCAST:
1044 		v.val = sock_flag(sk, SOCK_BROADCAST);
1045 		break;
1046 
1047 	case SO_SNDBUF:
1048 		v.val = sk->sk_sndbuf;
1049 		break;
1050 
1051 	case SO_RCVBUF:
1052 		v.val = sk->sk_rcvbuf;
1053 		break;
1054 
1055 	case SO_REUSEADDR:
1056 		v.val = sk->sk_reuse;
1057 		break;
1058 
1059 	case SO_REUSEPORT:
1060 		v.val = sk->sk_reuseport;
1061 		break;
1062 
1063 	case SO_KEEPALIVE:
1064 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1065 		break;
1066 
1067 	case SO_TYPE:
1068 		v.val = sk->sk_type;
1069 		break;
1070 
1071 	case SO_PROTOCOL:
1072 		v.val = sk->sk_protocol;
1073 		break;
1074 
1075 	case SO_DOMAIN:
1076 		v.val = sk->sk_family;
1077 		break;
1078 
1079 	case SO_ERROR:
1080 		v.val = -sock_error(sk);
1081 		if (v.val == 0)
1082 			v.val = xchg(&sk->sk_err_soft, 0);
1083 		break;
1084 
1085 	case SO_OOBINLINE:
1086 		v.val = sock_flag(sk, SOCK_URGINLINE);
1087 		break;
1088 
1089 	case SO_NO_CHECK:
1090 		v.val = sk->sk_no_check_tx;
1091 		break;
1092 
1093 	case SO_PRIORITY:
1094 		v.val = sk->sk_priority;
1095 		break;
1096 
1097 	case SO_LINGER:
1098 		lv		= sizeof(v.ling);
1099 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1100 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1101 		break;
1102 
1103 	case SO_BSDCOMPAT:
1104 		sock_warn_obsolete_bsdism("getsockopt");
1105 		break;
1106 
1107 	case SO_TIMESTAMP:
1108 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1110 		break;
1111 
1112 	case SO_TIMESTAMPNS:
1113 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1114 		break;
1115 
1116 	case SO_TIMESTAMPING:
1117 		v.val = sk->sk_tsflags;
1118 		break;
1119 
1120 	case SO_RCVTIMEO:
1121 		lv = sizeof(struct timeval);
1122 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123 			v.tm.tv_sec = 0;
1124 			v.tm.tv_usec = 0;
1125 		} else {
1126 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1128 		}
1129 		break;
1130 
1131 	case SO_SNDTIMEO:
1132 		lv = sizeof(struct timeval);
1133 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134 			v.tm.tv_sec = 0;
1135 			v.tm.tv_usec = 0;
1136 		} else {
1137 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139 		}
1140 		break;
1141 
1142 	case SO_RCVLOWAT:
1143 		v.val = sk->sk_rcvlowat;
1144 		break;
1145 
1146 	case SO_SNDLOWAT:
1147 		v.val = 1;
1148 		break;
1149 
1150 	case SO_PASSCRED:
1151 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1152 		break;
1153 
1154 	case SO_PEERCRED:
1155 	{
1156 		struct ucred peercred;
1157 		if (len > sizeof(peercred))
1158 			len = sizeof(peercred);
1159 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160 		if (copy_to_user(optval, &peercred, len))
1161 			return -EFAULT;
1162 		goto lenout;
1163 	}
1164 
1165 	case SO_PEERNAME:
1166 	{
1167 		char address[128];
1168 
1169 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170 			return -ENOTCONN;
1171 		if (lv < len)
1172 			return -EINVAL;
1173 		if (copy_to_user(optval, address, len))
1174 			return -EFAULT;
1175 		goto lenout;
1176 	}
1177 
1178 	/* Dubious BSD thing... Probably nobody even uses it, but
1179 	 * the UNIX standard wants it for whatever reason... -DaveM
1180 	 */
1181 	case SO_ACCEPTCONN:
1182 		v.val = sk->sk_state == TCP_LISTEN;
1183 		break;
1184 
1185 	case SO_PASSSEC:
1186 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1187 		break;
1188 
1189 	case SO_PEERSEC:
1190 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191 
1192 	case SO_MARK:
1193 		v.val = sk->sk_mark;
1194 		break;
1195 
1196 	case SO_RXQ_OVFL:
1197 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1198 		break;
1199 
1200 	case SO_WIFI_STATUS:
1201 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1202 		break;
1203 
1204 	case SO_PEEK_OFF:
1205 		if (!sock->ops->set_peek_off)
1206 			return -EOPNOTSUPP;
1207 
1208 		v.val = sk->sk_peek_off;
1209 		break;
1210 	case SO_NOFCS:
1211 		v.val = sock_flag(sk, SOCK_NOFCS);
1212 		break;
1213 
1214 	case SO_BINDTODEVICE:
1215 		return sock_getbindtodevice(sk, optval, optlen, len);
1216 
1217 	case SO_GET_FILTER:
1218 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219 		if (len < 0)
1220 			return len;
1221 
1222 		goto lenout;
1223 
1224 	case SO_LOCK_FILTER:
1225 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226 		break;
1227 
1228 	case SO_BPF_EXTENSIONS:
1229 		v.val = bpf_tell_extensions();
1230 		break;
1231 
1232 	case SO_SELECT_ERR_QUEUE:
1233 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234 		break;
1235 
1236 #ifdef CONFIG_NET_RX_BUSY_POLL
1237 	case SO_BUSY_POLL:
1238 		v.val = sk->sk_ll_usec;
1239 		break;
1240 #endif
1241 
1242 	case SO_MAX_PACING_RATE:
1243 		v.val = sk->sk_max_pacing_rate;
1244 		break;
1245 
1246 	case SO_INCOMING_CPU:
1247 		v.val = sk->sk_incoming_cpu;
1248 		break;
1249 
1250 	default:
1251 		/* We implement the SO_SNDLOWAT etc to not be settable
1252 		 * (1003.1g 7).
1253 		 */
1254 		return -ENOPROTOOPT;
1255 	}
1256 
1257 	if (len > lv)
1258 		len = lv;
1259 	if (copy_to_user(optval, &v, len))
1260 		return -EFAULT;
1261 lenout:
1262 	if (put_user(len, optlen))
1263 		return -EFAULT;
1264 	return 0;
1265 }
1266 
1267 /*
1268  * Initialize an sk_lock.
1269  *
1270  * (We also register the sk_lock with the lock validator.)
1271  */
1272 static inline void sock_lock_init(struct sock *sk)
1273 {
1274 	sock_lock_init_class_and_name(sk,
1275 			af_family_slock_key_strings[sk->sk_family],
1276 			af_family_slock_keys + sk->sk_family,
1277 			af_family_key_strings[sk->sk_family],
1278 			af_family_keys + sk->sk_family);
1279 }
1280 
1281 /*
1282  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1283  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1284  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1285  */
1286 static void sock_copy(struct sock *nsk, const struct sock *osk)
1287 {
1288 #ifdef CONFIG_SECURITY_NETWORK
1289 	void *sptr = nsk->sk_security;
1290 #endif
1291 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292 
1293 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295 
1296 #ifdef CONFIG_SECURITY_NETWORK
1297 	nsk->sk_security = sptr;
1298 	security_sk_clone(osk, nsk);
1299 #endif
1300 }
1301 
1302 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303 {
1304 	unsigned long nulls1, nulls2;
1305 
1306 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308 	if (nulls1 > nulls2)
1309 		swap(nulls1, nulls2);
1310 
1311 	if (nulls1 != 0)
1312 		memset((char *)sk, 0, nulls1);
1313 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1314 	       nulls2 - nulls1 - sizeof(void *));
1315 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1316 	       size - nulls2 - sizeof(void *));
1317 }
1318 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319 
1320 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321 		int family)
1322 {
1323 	struct sock *sk;
1324 	struct kmem_cache *slab;
1325 
1326 	slab = prot->slab;
1327 	if (slab != NULL) {
1328 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329 		if (!sk)
1330 			return sk;
1331 		if (priority & __GFP_ZERO) {
1332 			if (prot->clear_sk)
1333 				prot->clear_sk(sk, prot->obj_size);
1334 			else
1335 				sk_prot_clear_nulls(sk, prot->obj_size);
1336 		}
1337 	} else
1338 		sk = kmalloc(prot->obj_size, priority);
1339 
1340 	if (sk != NULL) {
1341 		kmemcheck_annotate_bitfield(sk, flags);
1342 
1343 		if (security_sk_alloc(sk, family, priority))
1344 			goto out_free;
1345 
1346 		if (!try_module_get(prot->owner))
1347 			goto out_free_sec;
1348 		sk_tx_queue_clear(sk);
1349 		cgroup_sk_alloc(&sk->sk_cgrp_data);
1350 	}
1351 
1352 	return sk;
1353 
1354 out_free_sec:
1355 	security_sk_free(sk);
1356 out_free:
1357 	if (slab != NULL)
1358 		kmem_cache_free(slab, sk);
1359 	else
1360 		kfree(sk);
1361 	return NULL;
1362 }
1363 
1364 static void sk_prot_free(struct proto *prot, struct sock *sk)
1365 {
1366 	struct kmem_cache *slab;
1367 	struct module *owner;
1368 
1369 	owner = prot->owner;
1370 	slab = prot->slab;
1371 
1372 	cgroup_sk_free(&sk->sk_cgrp_data);
1373 	security_sk_free(sk);
1374 	if (slab != NULL)
1375 		kmem_cache_free(slab, sk);
1376 	else
1377 		kfree(sk);
1378 	module_put(owner);
1379 }
1380 
1381 /**
1382  *	sk_alloc - All socket objects are allocated here
1383  *	@net: the applicable net namespace
1384  *	@family: protocol family
1385  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1386  *	@prot: struct proto associated with this new sock instance
1387  *	@kern: is this to be a kernel socket?
1388  */
1389 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1390 		      struct proto *prot, int kern)
1391 {
1392 	struct sock *sk;
1393 
1394 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1395 	if (sk) {
1396 		sk->sk_family = family;
1397 		/*
1398 		 * See comment in struct sock definition to understand
1399 		 * why we need sk_prot_creator -acme
1400 		 */
1401 		sk->sk_prot = sk->sk_prot_creator = prot;
1402 		sock_lock_init(sk);
1403 		sk->sk_net_refcnt = kern ? 0 : 1;
1404 		if (likely(sk->sk_net_refcnt))
1405 			get_net(net);
1406 		sock_net_set(sk, net);
1407 		atomic_set(&sk->sk_wmem_alloc, 1);
1408 
1409 		sock_update_classid(&sk->sk_cgrp_data);
1410 		sock_update_netprioidx(&sk->sk_cgrp_data);
1411 	}
1412 
1413 	return sk;
1414 }
1415 EXPORT_SYMBOL(sk_alloc);
1416 
1417 void sk_destruct(struct sock *sk)
1418 {
1419 	struct sk_filter *filter;
1420 
1421 	if (sk->sk_destruct)
1422 		sk->sk_destruct(sk);
1423 
1424 	filter = rcu_dereference_check(sk->sk_filter,
1425 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1426 	if (filter) {
1427 		sk_filter_uncharge(sk, filter);
1428 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1429 	}
1430 	if (rcu_access_pointer(sk->sk_reuseport_cb))
1431 		reuseport_detach_sock(sk);
1432 
1433 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1434 
1435 	if (atomic_read(&sk->sk_omem_alloc))
1436 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1437 			 __func__, atomic_read(&sk->sk_omem_alloc));
1438 
1439 	if (sk->sk_peer_cred)
1440 		put_cred(sk->sk_peer_cred);
1441 	put_pid(sk->sk_peer_pid);
1442 	if (likely(sk->sk_net_refcnt))
1443 		put_net(sock_net(sk));
1444 	sk_prot_free(sk->sk_prot_creator, sk);
1445 }
1446 
1447 static void __sk_free(struct sock *sk)
1448 {
1449 	if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
1450 		sock_diag_broadcast_destroy(sk);
1451 	else
1452 		sk_destruct(sk);
1453 }
1454 
1455 void sk_free(struct sock *sk)
1456 {
1457 	/*
1458 	 * We subtract one from sk_wmem_alloc and can know if
1459 	 * some packets are still in some tx queue.
1460 	 * If not null, sock_wfree() will call __sk_free(sk) later
1461 	 */
1462 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1463 		__sk_free(sk);
1464 }
1465 EXPORT_SYMBOL(sk_free);
1466 
1467 /**
1468  *	sk_clone_lock - clone a socket, and lock its clone
1469  *	@sk: the socket to clone
1470  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1471  *
1472  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1473  */
1474 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1475 {
1476 	struct sock *newsk;
1477 	bool is_charged = true;
1478 
1479 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1480 	if (newsk != NULL) {
1481 		struct sk_filter *filter;
1482 
1483 		sock_copy(newsk, sk);
1484 
1485 		/* SANITY */
1486 		if (likely(newsk->sk_net_refcnt))
1487 			get_net(sock_net(newsk));
1488 		sk_node_init(&newsk->sk_node);
1489 		sock_lock_init(newsk);
1490 		bh_lock_sock(newsk);
1491 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1492 		newsk->sk_backlog.len = 0;
1493 
1494 		atomic_set(&newsk->sk_rmem_alloc, 0);
1495 		/*
1496 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1497 		 */
1498 		atomic_set(&newsk->sk_wmem_alloc, 1);
1499 		atomic_set(&newsk->sk_omem_alloc, 0);
1500 		skb_queue_head_init(&newsk->sk_receive_queue);
1501 		skb_queue_head_init(&newsk->sk_write_queue);
1502 
1503 		rwlock_init(&newsk->sk_callback_lock);
1504 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1505 				af_callback_keys + newsk->sk_family,
1506 				af_family_clock_key_strings[newsk->sk_family]);
1507 
1508 		newsk->sk_dst_cache	= NULL;
1509 		newsk->sk_wmem_queued	= 0;
1510 		newsk->sk_forward_alloc = 0;
1511 		newsk->sk_send_head	= NULL;
1512 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1513 
1514 		sock_reset_flag(newsk, SOCK_DONE);
1515 		skb_queue_head_init(&newsk->sk_error_queue);
1516 
1517 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1518 		if (filter != NULL)
1519 			/* though it's an empty new sock, the charging may fail
1520 			 * if sysctl_optmem_max was changed between creation of
1521 			 * original socket and cloning
1522 			 */
1523 			is_charged = sk_filter_charge(newsk, filter);
1524 
1525 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1526 			/* It is still raw copy of parent, so invalidate
1527 			 * destructor and make plain sk_free() */
1528 			newsk->sk_destruct = NULL;
1529 			bh_unlock_sock(newsk);
1530 			sk_free(newsk);
1531 			newsk = NULL;
1532 			goto out;
1533 		}
1534 
1535 		newsk->sk_err	   = 0;
1536 		newsk->sk_priority = 0;
1537 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1538 		atomic64_set(&newsk->sk_cookie, 0);
1539 		/*
1540 		 * Before updating sk_refcnt, we must commit prior changes to memory
1541 		 * (Documentation/RCU/rculist_nulls.txt for details)
1542 		 */
1543 		smp_wmb();
1544 		atomic_set(&newsk->sk_refcnt, 2);
1545 
1546 		/*
1547 		 * Increment the counter in the same struct proto as the master
1548 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1549 		 * is the same as sk->sk_prot->socks, as this field was copied
1550 		 * with memcpy).
1551 		 *
1552 		 * This _changes_ the previous behaviour, where
1553 		 * tcp_create_openreq_child always was incrementing the
1554 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1555 		 * to be taken into account in all callers. -acme
1556 		 */
1557 		sk_refcnt_debug_inc(newsk);
1558 		sk_set_socket(newsk, NULL);
1559 		newsk->sk_wq = NULL;
1560 
1561 		if (mem_cgroup_sockets_enabled && sk->sk_memcg)
1562 			sock_update_memcg(newsk);
1563 
1564 		if (newsk->sk_prot->sockets_allocated)
1565 			sk_sockets_allocated_inc(newsk);
1566 
1567 		if (sock_needs_netstamp(sk) &&
1568 		    newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1569 			net_enable_timestamp();
1570 	}
1571 out:
1572 	return newsk;
1573 }
1574 EXPORT_SYMBOL_GPL(sk_clone_lock);
1575 
1576 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1577 {
1578 	u32 max_segs = 1;
1579 
1580 	sk_dst_set(sk, dst);
1581 	sk->sk_route_caps = dst->dev->features;
1582 	if (sk->sk_route_caps & NETIF_F_GSO)
1583 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1584 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1585 	if (sk_can_gso(sk)) {
1586 		if (dst->header_len) {
1587 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1588 		} else {
1589 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1590 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1591 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1592 		}
1593 	}
1594 	sk->sk_gso_max_segs = max_segs;
1595 }
1596 EXPORT_SYMBOL_GPL(sk_setup_caps);
1597 
1598 /*
1599  *	Simple resource managers for sockets.
1600  */
1601 
1602 
1603 /*
1604  * Write buffer destructor automatically called from kfree_skb.
1605  */
1606 void sock_wfree(struct sk_buff *skb)
1607 {
1608 	struct sock *sk = skb->sk;
1609 	unsigned int len = skb->truesize;
1610 
1611 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1612 		/*
1613 		 * Keep a reference on sk_wmem_alloc, this will be released
1614 		 * after sk_write_space() call
1615 		 */
1616 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1617 		sk->sk_write_space(sk);
1618 		len = 1;
1619 	}
1620 	/*
1621 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1622 	 * could not do because of in-flight packets
1623 	 */
1624 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1625 		__sk_free(sk);
1626 }
1627 EXPORT_SYMBOL(sock_wfree);
1628 
1629 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1630 {
1631 	skb_orphan(skb);
1632 	skb->sk = sk;
1633 #ifdef CONFIG_INET
1634 	if (unlikely(!sk_fullsock(sk))) {
1635 		skb->destructor = sock_edemux;
1636 		sock_hold(sk);
1637 		return;
1638 	}
1639 #endif
1640 	skb->destructor = sock_wfree;
1641 	skb_set_hash_from_sk(skb, sk);
1642 	/*
1643 	 * We used to take a refcount on sk, but following operation
1644 	 * is enough to guarantee sk_free() wont free this sock until
1645 	 * all in-flight packets are completed
1646 	 */
1647 	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
1648 }
1649 EXPORT_SYMBOL(skb_set_owner_w);
1650 
1651 void skb_orphan_partial(struct sk_buff *skb)
1652 {
1653 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1654 	 * so we do not completely orphan skb, but transfert all
1655 	 * accounted bytes but one, to avoid unexpected reorders.
1656 	 */
1657 	if (skb->destructor == sock_wfree
1658 #ifdef CONFIG_INET
1659 	    || skb->destructor == tcp_wfree
1660 #endif
1661 		) {
1662 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1663 		skb->truesize = 1;
1664 	} else {
1665 		skb_orphan(skb);
1666 	}
1667 }
1668 EXPORT_SYMBOL(skb_orphan_partial);
1669 
1670 /*
1671  * Read buffer destructor automatically called from kfree_skb.
1672  */
1673 void sock_rfree(struct sk_buff *skb)
1674 {
1675 	struct sock *sk = skb->sk;
1676 	unsigned int len = skb->truesize;
1677 
1678 	atomic_sub(len, &sk->sk_rmem_alloc);
1679 	sk_mem_uncharge(sk, len);
1680 }
1681 EXPORT_SYMBOL(sock_rfree);
1682 
1683 /*
1684  * Buffer destructor for skbs that are not used directly in read or write
1685  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1686  */
1687 void sock_efree(struct sk_buff *skb)
1688 {
1689 	sock_put(skb->sk);
1690 }
1691 EXPORT_SYMBOL(sock_efree);
1692 
1693 kuid_t sock_i_uid(struct sock *sk)
1694 {
1695 	kuid_t uid;
1696 
1697 	read_lock_bh(&sk->sk_callback_lock);
1698 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1699 	read_unlock_bh(&sk->sk_callback_lock);
1700 	return uid;
1701 }
1702 EXPORT_SYMBOL(sock_i_uid);
1703 
1704 unsigned long sock_i_ino(struct sock *sk)
1705 {
1706 	unsigned long ino;
1707 
1708 	read_lock_bh(&sk->sk_callback_lock);
1709 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1710 	read_unlock_bh(&sk->sk_callback_lock);
1711 	return ino;
1712 }
1713 EXPORT_SYMBOL(sock_i_ino);
1714 
1715 /*
1716  * Allocate a skb from the socket's send buffer.
1717  */
1718 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1719 			     gfp_t priority)
1720 {
1721 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1722 		struct sk_buff *skb = alloc_skb(size, priority);
1723 		if (skb) {
1724 			skb_set_owner_w(skb, sk);
1725 			return skb;
1726 		}
1727 	}
1728 	return NULL;
1729 }
1730 EXPORT_SYMBOL(sock_wmalloc);
1731 
1732 /*
1733  * Allocate a memory block from the socket's option memory buffer.
1734  */
1735 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1736 {
1737 	if ((unsigned int)size <= sysctl_optmem_max &&
1738 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1739 		void *mem;
1740 		/* First do the add, to avoid the race if kmalloc
1741 		 * might sleep.
1742 		 */
1743 		atomic_add(size, &sk->sk_omem_alloc);
1744 		mem = kmalloc(size, priority);
1745 		if (mem)
1746 			return mem;
1747 		atomic_sub(size, &sk->sk_omem_alloc);
1748 	}
1749 	return NULL;
1750 }
1751 EXPORT_SYMBOL(sock_kmalloc);
1752 
1753 /* Free an option memory block. Note, we actually want the inline
1754  * here as this allows gcc to detect the nullify and fold away the
1755  * condition entirely.
1756  */
1757 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1758 				  const bool nullify)
1759 {
1760 	if (WARN_ON_ONCE(!mem))
1761 		return;
1762 	if (nullify)
1763 		kzfree(mem);
1764 	else
1765 		kfree(mem);
1766 	atomic_sub(size, &sk->sk_omem_alloc);
1767 }
1768 
1769 void sock_kfree_s(struct sock *sk, void *mem, int size)
1770 {
1771 	__sock_kfree_s(sk, mem, size, false);
1772 }
1773 EXPORT_SYMBOL(sock_kfree_s);
1774 
1775 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1776 {
1777 	__sock_kfree_s(sk, mem, size, true);
1778 }
1779 EXPORT_SYMBOL(sock_kzfree_s);
1780 
1781 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1782    I think, these locks should be removed for datagram sockets.
1783  */
1784 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1785 {
1786 	DEFINE_WAIT(wait);
1787 
1788 	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1789 	for (;;) {
1790 		if (!timeo)
1791 			break;
1792 		if (signal_pending(current))
1793 			break;
1794 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1795 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1796 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1797 			break;
1798 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1799 			break;
1800 		if (sk->sk_err)
1801 			break;
1802 		timeo = schedule_timeout(timeo);
1803 	}
1804 	finish_wait(sk_sleep(sk), &wait);
1805 	return timeo;
1806 }
1807 
1808 
1809 /*
1810  *	Generic send/receive buffer handlers
1811  */
1812 
1813 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1814 				     unsigned long data_len, int noblock,
1815 				     int *errcode, int max_page_order)
1816 {
1817 	struct sk_buff *skb;
1818 	long timeo;
1819 	int err;
1820 
1821 	timeo = sock_sndtimeo(sk, noblock);
1822 	for (;;) {
1823 		err = sock_error(sk);
1824 		if (err != 0)
1825 			goto failure;
1826 
1827 		err = -EPIPE;
1828 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1829 			goto failure;
1830 
1831 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1832 			break;
1833 
1834 		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1835 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1836 		err = -EAGAIN;
1837 		if (!timeo)
1838 			goto failure;
1839 		if (signal_pending(current))
1840 			goto interrupted;
1841 		timeo = sock_wait_for_wmem(sk, timeo);
1842 	}
1843 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1844 				   errcode, sk->sk_allocation);
1845 	if (skb)
1846 		skb_set_owner_w(skb, sk);
1847 	return skb;
1848 
1849 interrupted:
1850 	err = sock_intr_errno(timeo);
1851 failure:
1852 	*errcode = err;
1853 	return NULL;
1854 }
1855 EXPORT_SYMBOL(sock_alloc_send_pskb);
1856 
/*
 * Convenience wrapper around sock_alloc_send_pskb(): @size is passed as
 * the header length, with a data length of 0 and a max page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
EXPORT_SYMBOL(sock_alloc_send_skb);
1863 
1864 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
1865 		   struct sockcm_cookie *sockc)
1866 {
1867 	struct cmsghdr *cmsg;
1868 
1869 	for_each_cmsghdr(cmsg, msg) {
1870 		if (!CMSG_OK(msg, cmsg))
1871 			return -EINVAL;
1872 		if (cmsg->cmsg_level != SOL_SOCKET)
1873 			continue;
1874 		switch (cmsg->cmsg_type) {
1875 		case SO_MARK:
1876 			if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1877 				return -EPERM;
1878 			if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
1879 				return -EINVAL;
1880 			sockc->mark = *(u32 *)CMSG_DATA(cmsg);
1881 			break;
1882 		default:
1883 			return -EINVAL;
1884 		}
1885 	}
1886 	return 0;
1887 }
1888 EXPORT_SYMBOL(sock_cmsg_send);
1889 
1890 /* On 32bit arches, an skb frag is limited to 2^15 */
1891 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1892 
1893 /**
1894  * skb_page_frag_refill - check that a page_frag contains enough room
1895  * @sz: minimum size of the fragment we want to get
1896  * @pfrag: pointer to page_frag
1897  * @gfp: priority for memory allocation
1898  *
1899  * Note: While this allocator tries to use high order pages, there is
1900  * no guarantee that allocations succeed. Therefore, @sz MUST be
1901  * less or equal than PAGE_SIZE.
1902  */
1903 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1904 {
1905 	if (pfrag->page) {
1906 		if (atomic_read(&pfrag->page->_count) == 1) {
1907 			pfrag->offset = 0;
1908 			return true;
1909 		}
1910 		if (pfrag->offset + sz <= pfrag->size)
1911 			return true;
1912 		put_page(pfrag->page);
1913 	}
1914 
1915 	pfrag->offset = 0;
1916 	if (SKB_FRAG_PAGE_ORDER) {
1917 		/* Avoid direct reclaim but allow kswapd to wake */
1918 		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
1919 					  __GFP_COMP | __GFP_NOWARN |
1920 					  __GFP_NORETRY,
1921 					  SKB_FRAG_PAGE_ORDER);
1922 		if (likely(pfrag->page)) {
1923 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1924 			return true;
1925 		}
1926 	}
1927 	pfrag->page = alloc_page(gfp);
1928 	if (likely(pfrag->page)) {
1929 		pfrag->size = PAGE_SIZE;
1930 		return true;
1931 	}
1932 	return false;
1933 }
1934 EXPORT_SYMBOL(skb_page_frag_refill);
1935 
1936 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1937 {
1938 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1939 		return true;
1940 
1941 	sk_enter_memory_pressure(sk);
1942 	sk_stream_moderate_sndbuf(sk);
1943 	return false;
1944 }
1945 EXPORT_SYMBOL(sk_page_frag_refill);
1946 
1947 static void __lock_sock(struct sock *sk)
1948 	__releases(&sk->sk_lock.slock)
1949 	__acquires(&sk->sk_lock.slock)
1950 {
1951 	DEFINE_WAIT(wait);
1952 
1953 	for (;;) {
1954 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1955 					TASK_UNINTERRUPTIBLE);
1956 		spin_unlock_bh(&sk->sk_lock.slock);
1957 		schedule();
1958 		spin_lock_bh(&sk->sk_lock.slock);
1959 		if (!sock_owned_by_user(sk))
1960 			break;
1961 	}
1962 	finish_wait(&sk->sk_lock.wq, &wait);
1963 }
1964 
1965 static void __release_sock(struct sock *sk)
1966 	__releases(&sk->sk_lock.slock)
1967 	__acquires(&sk->sk_lock.slock)
1968 {
1969 	struct sk_buff *skb = sk->sk_backlog.head;
1970 
1971 	do {
1972 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1973 		bh_unlock_sock(sk);
1974 
1975 		do {
1976 			struct sk_buff *next = skb->next;
1977 
1978 			prefetch(next);
1979 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1980 			skb->next = NULL;
1981 			sk_backlog_rcv(sk, skb);
1982 
1983 			/*
1984 			 * We are in process context here with softirqs
1985 			 * disabled, use cond_resched_softirq() to preempt.
1986 			 * This is safe to do because we've taken the backlog
1987 			 * queue private:
1988 			 */
1989 			cond_resched_softirq();
1990 
1991 			skb = next;
1992 		} while (skb != NULL);
1993 
1994 		bh_lock_sock(sk);
1995 	} while ((skb = sk->sk_backlog.head) != NULL);
1996 
1997 	/*
1998 	 * Doing the zeroing here guarantee we can not loop forever
1999 	 * while a wild producer attempts to flood us.
2000 	 */
2001 	sk->sk_backlog.len = 0;
2002 }
2003 
2004 /**
2005  * sk_wait_data - wait for data to arrive at sk_receive_queue
2006  * @sk:    sock to wait on
2007  * @timeo: for how long
2008  * @skb:   last skb seen on sk_receive_queue
2009  *
2010  * Now socket state including sk->sk_err is changed only under lock,
2011  * hence we may omit checks after joining wait queue.
2012  * We check receive queue before schedule() only as optimization;
2013  * it is very likely that release_sock() added new data.
2014  */
2015 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2016 {
2017 	int rc;
2018 	DEFINE_WAIT(wait);
2019 
2020 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2021 	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2022 	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
2023 	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2024 	finish_wait(sk_sleep(sk), &wait);
2025 	return rc;
2026 }
2027 EXPORT_SYMBOL(sk_wait_data);
2028 
2029 /**
2030  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2031  *	@sk: socket
2032  *	@size: memory size to allocate
2033  *	@kind: allocation type
2034  *
2035  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2036  *	rmem allocation. This function assumes that protocols which have
2037  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2038  */
2039 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2040 {
2041 	struct proto *prot = sk->sk_prot;
2042 	int amt = sk_mem_pages(size);
2043 	long allocated;
2044 
2045 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2046 
2047 	allocated = sk_memory_allocated_add(sk, amt);
2048 
2049 	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2050 	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2051 		goto suppress_allocation;
2052 
2053 	/* Under limit. */
2054 	if (allocated <= sk_prot_mem_limits(sk, 0)) {
2055 		sk_leave_memory_pressure(sk);
2056 		return 1;
2057 	}
2058 
2059 	/* Under pressure. */
2060 	if (allocated > sk_prot_mem_limits(sk, 1))
2061 		sk_enter_memory_pressure(sk);
2062 
2063 	/* Over hard limit. */
2064 	if (allocated > sk_prot_mem_limits(sk, 2))
2065 		goto suppress_allocation;
2066 
2067 	/* guarantee minimum buffer size under pressure */
2068 	if (kind == SK_MEM_RECV) {
2069 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2070 			return 1;
2071 
2072 	} else { /* SK_MEM_SEND */
2073 		if (sk->sk_type == SOCK_STREAM) {
2074 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2075 				return 1;
2076 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2077 			   prot->sysctl_wmem[0])
2078 				return 1;
2079 	}
2080 
2081 	if (sk_has_memory_pressure(sk)) {
2082 		int alloc;
2083 
2084 		if (!sk_under_memory_pressure(sk))
2085 			return 1;
2086 		alloc = sk_sockets_allocated_read_positive(sk);
2087 		if (sk_prot_mem_limits(sk, 2) > alloc *
2088 		    sk_mem_pages(sk->sk_wmem_queued +
2089 				 atomic_read(&sk->sk_rmem_alloc) +
2090 				 sk->sk_forward_alloc))
2091 			return 1;
2092 	}
2093 
2094 suppress_allocation:
2095 
2096 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2097 		sk_stream_moderate_sndbuf(sk);
2098 
2099 		/* Fail only if socket is _under_ its sndbuf.
2100 		 * In this case we cannot block, so that we have to fail.
2101 		 */
2102 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2103 			return 1;
2104 	}
2105 
2106 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2107 
2108 	/* Alas. Undo changes. */
2109 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2110 
2111 	sk_memory_allocated_sub(sk, amt);
2112 
2113 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2114 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2115 
2116 	return 0;
2117 }
2118 EXPORT_SYMBOL(__sk_mem_schedule);
2119 
2120 /**
2121  *	__sk_mem_reclaim - reclaim memory_allocated
2122  *	@sk: socket
2123  *	@amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2124  */
2125 void __sk_mem_reclaim(struct sock *sk, int amount)
2126 {
2127 	amount >>= SK_MEM_QUANTUM_SHIFT;
2128 	sk_memory_allocated_sub(sk, amount);
2129 	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2130 
2131 	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2132 		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2133 
2134 	if (sk_under_memory_pressure(sk) &&
2135 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2136 		sk_leave_memory_pressure(sk);
2137 }
2138 EXPORT_SYMBOL(__sk_mem_reclaim);
2139 
2140 
2141 /*
2142  * Set of default routines for initialising struct proto_ops when
2143  * the protocol does not support a particular function. In certain
2144  * cases where it makes no sense for a protocol to have a "do nothing"
2145  * function, some default processing is provided.
2146  */
2147 
2148 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2149 {
2150 	return -EOPNOTSUPP;
2151 }
2152 EXPORT_SYMBOL(sock_no_bind);
2153 
/* Default ->connect() for protocols that cannot connect: always -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
2159 EXPORT_SYMBOL(sock_no_connect);
2160 
/* Default ->socketpair() for protocols without pair support: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2165 EXPORT_SYMBOL(sock_no_socketpair);
2166 
/* Default ->accept() for protocols that cannot accept: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
2171 EXPORT_SYMBOL(sock_no_accept);
2172 
/*
 * Default ->getname() for protocols without address reporting:
 * always -EOPNOTSUPP; neither @saddr nor @len is written.
 */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
2178 EXPORT_SYMBOL(sock_no_getname);
2179 
2180 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2181 {
2182 	return 0;
2183 }
2184 EXPORT_SYMBOL(sock_no_poll);
2185 
/* Default ->ioctl() for protocols without ioctls: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2190 EXPORT_SYMBOL(sock_no_ioctl);
2191 
/* Default ->listen() for protocols that cannot listen: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2196 EXPORT_SYMBOL(sock_no_listen);
2197 
/* Default ->shutdown() for protocols without shutdown support: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2202 EXPORT_SYMBOL(sock_no_shutdown);
2203 
2204 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2205 		    char __user *optval, unsigned int optlen)
2206 {
2207 	return -EOPNOTSUPP;
2208 }
2209 EXPORT_SYMBOL(sock_no_setsockopt);
2210 
2211 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2212 		    char __user *optval, int __user *optlen)
2213 {
2214 	return -EOPNOTSUPP;
2215 }
2216 EXPORT_SYMBOL(sock_no_getsockopt);
2217 
/* Default ->sendmsg() for protocols that cannot send: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2222 EXPORT_SYMBOL(sock_no_sendmsg);
2223 
/* Default ->recvmsg() for protocols that cannot receive: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2229 EXPORT_SYMBOL(sock_no_recvmsg);
2230 
/*
 * Default ->mmap() for protocols that cannot be mapped.  Unlike the other
 * stubs this one returns -ENODEV, mirroring the error code used when an
 * mmap method is missing altogether.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	return -ENODEV;
}
2236 EXPORT_SYMBOL(sock_no_mmap);
2237 
2238 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2239 {
2240 	ssize_t res;
2241 	struct msghdr msg = {.msg_flags = flags};
2242 	struct kvec iov;
2243 	char *kaddr = kmap(page);
2244 	iov.iov_base = kaddr + offset;
2245 	iov.iov_len = size;
2246 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2247 	kunmap(page);
2248 	return res;
2249 }
2250 EXPORT_SYMBOL(sock_no_sendpage);
2251 
2252 /*
2253  *	Default Socket Callbacks
2254  */
2255 
2256 static void sock_def_wakeup(struct sock *sk)
2257 {
2258 	struct socket_wq *wq;
2259 
2260 	rcu_read_lock();
2261 	wq = rcu_dereference(sk->sk_wq);
2262 	if (skwq_has_sleeper(wq))
2263 		wake_up_interruptible_all(&wq->wait);
2264 	rcu_read_unlock();
2265 }
2266 
2267 static void sock_def_error_report(struct sock *sk)
2268 {
2269 	struct socket_wq *wq;
2270 
2271 	rcu_read_lock();
2272 	wq = rcu_dereference(sk->sk_wq);
2273 	if (skwq_has_sleeper(wq))
2274 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2275 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2276 	rcu_read_unlock();
2277 }
2278 
2279 static void sock_def_readable(struct sock *sk)
2280 {
2281 	struct socket_wq *wq;
2282 
2283 	rcu_read_lock();
2284 	wq = rcu_dereference(sk->sk_wq);
2285 	if (skwq_has_sleeper(wq))
2286 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2287 						POLLRDNORM | POLLRDBAND);
2288 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2289 	rcu_read_unlock();
2290 }
2291 
2292 static void sock_def_write_space(struct sock *sk)
2293 {
2294 	struct socket_wq *wq;
2295 
2296 	rcu_read_lock();
2297 
2298 	/* Do not wake up a writer until he can make "significant"
2299 	 * progress.  --DaveM
2300 	 */
2301 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2302 		wq = rcu_dereference(sk->sk_wq);
2303 		if (skwq_has_sleeper(wq))
2304 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2305 						POLLWRNORM | POLLWRBAND);
2306 
2307 		/* Should agree with poll, otherwise some programs break */
2308 		if (sock_writeable(sk))
2309 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2310 	}
2311 
2312 	rcu_read_unlock();
2313 }
2314 
/* Default sk_destruct callback (installed by sock_init_data()): does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
2318 
2319 void sk_send_sigurg(struct sock *sk)
2320 {
2321 	if (sk->sk_socket && sk->sk_socket->file)
2322 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2323 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2324 }
2325 EXPORT_SYMBOL(sk_send_sigurg);
2326 
/*
 * (Re)arm @timer to fire at @expires on behalf of @sk.
 *
 * A socket reference is taken only when mod_timer() returns 0; the
 * counterpart in sk_stop_timer() drops a reference when del_timer()
 * returns non-zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
		    unsigned long expires)
{
	int ret = mod_timer(timer, expires);

	if (ret == 0)
		sock_hold(sk);
}
2333 EXPORT_SYMBOL(sk_reset_timer);
2334 
/*
 * Cancel @timer for @sk.  When del_timer() returns non-zero, the socket
 * reference is dropped with __sock_put(); sk_reset_timer() is the
 * counterpart that takes a reference with sock_hold().
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
	int ret = del_timer(timer);

	if (ret != 0)
		__sock_put(sk);
}
2340 EXPORT_SYMBOL(sk_stop_timer);
2341 
2342 void sock_init_data(struct socket *sock, struct sock *sk)
2343 {
2344 	skb_queue_head_init(&sk->sk_receive_queue);
2345 	skb_queue_head_init(&sk->sk_write_queue);
2346 	skb_queue_head_init(&sk->sk_error_queue);
2347 
2348 	sk->sk_send_head	=	NULL;
2349 
2350 	init_timer(&sk->sk_timer);
2351 
2352 	sk->sk_allocation	=	GFP_KERNEL;
2353 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2354 	sk->sk_sndbuf		=	sysctl_wmem_default;
2355 	sk->sk_state		=	TCP_CLOSE;
2356 	sk_set_socket(sk, sock);
2357 
2358 	sock_set_flag(sk, SOCK_ZAPPED);
2359 
2360 	if (sock) {
2361 		sk->sk_type	=	sock->type;
2362 		sk->sk_wq	=	sock->wq;
2363 		sock->sk	=	sk;
2364 	} else
2365 		sk->sk_wq	=	NULL;
2366 
2367 	rwlock_init(&sk->sk_callback_lock);
2368 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2369 			af_callback_keys + sk->sk_family,
2370 			af_family_clock_key_strings[sk->sk_family]);
2371 
2372 	sk->sk_state_change	=	sock_def_wakeup;
2373 	sk->sk_data_ready	=	sock_def_readable;
2374 	sk->sk_write_space	=	sock_def_write_space;
2375 	sk->sk_error_report	=	sock_def_error_report;
2376 	sk->sk_destruct		=	sock_def_destruct;
2377 
2378 	sk->sk_frag.page	=	NULL;
2379 	sk->sk_frag.offset	=	0;
2380 	sk->sk_peek_off		=	-1;
2381 
2382 	sk->sk_peer_pid 	=	NULL;
2383 	sk->sk_peer_cred	=	NULL;
2384 	sk->sk_write_pending	=	0;
2385 	sk->sk_rcvlowat		=	1;
2386 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2387 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2388 
2389 	sk->sk_stamp = ktime_set(-1L, 0);
2390 
2391 #ifdef CONFIG_NET_RX_BUSY_POLL
2392 	sk->sk_napi_id		=	0;
2393 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2394 #endif
2395 
2396 	sk->sk_max_pacing_rate = ~0U;
2397 	sk->sk_pacing_rate = ~0U;
2398 	sk->sk_incoming_cpu = -1;
2399 	/*
2400 	 * Before updating sk_refcnt, we must commit prior changes to memory
2401 	 * (Documentation/RCU/rculist_nulls.txt for details)
2402 	 */
2403 	smp_wmb();
2404 	atomic_set(&sk->sk_refcnt, 1);
2405 	atomic_set(&sk->sk_drops, 0);
2406 }
2407 EXPORT_SYMBOL(sock_init_data);
2408 
2409 void lock_sock_nested(struct sock *sk, int subclass)
2410 {
2411 	might_sleep();
2412 	spin_lock_bh(&sk->sk_lock.slock);
2413 	if (sk->sk_lock.owned)
2414 		__lock_sock(sk);
2415 	sk->sk_lock.owned = 1;
2416 	spin_unlock(&sk->sk_lock.slock);
2417 	/*
2418 	 * The sk_lock has mutex_lock() semantics here:
2419 	 */
2420 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2421 	local_bh_enable();
2422 }
2423 EXPORT_SYMBOL(lock_sock_nested);
2424 
2425 void release_sock(struct sock *sk)
2426 {
2427 	/*
2428 	 * The sk_lock has mutex_unlock() semantics:
2429 	 */
2430 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2431 
2432 	spin_lock_bh(&sk->sk_lock.slock);
2433 	if (sk->sk_backlog.tail)
2434 		__release_sock(sk);
2435 
2436 	/* Warning : release_cb() might need to release sk ownership,
2437 	 * ie call sock_release_ownership(sk) before us.
2438 	 */
2439 	if (sk->sk_prot->release_cb)
2440 		sk->sk_prot->release_cb(sk);
2441 
2442 	sock_release_ownership(sk);
2443 	if (waitqueue_active(&sk->sk_lock.wq))
2444 		wake_up(&sk->sk_lock.wq);
2445 	spin_unlock_bh(&sk->sk_lock.slock);
2446 }
2447 EXPORT_SYMBOL(release_sock);
2448 
2449 /**
2450  * lock_sock_fast - fast version of lock_sock
2451  * @sk: socket
2452  *
2453  * This version should be used for very small section, where process wont block
2454  * return false if fast path is taken
2455  *   sk_lock.slock locked, owned = 0, BH disabled
2456  * return true if slow path is taken
2457  *   sk_lock.slock unlocked, owned = 1, BH enabled
2458  */
2459 bool lock_sock_fast(struct sock *sk)
2460 {
2461 	might_sleep();
2462 	spin_lock_bh(&sk->sk_lock.slock);
2463 
2464 	if (!sk->sk_lock.owned)
2465 		/*
2466 		 * Note : We must disable BH
2467 		 */
2468 		return false;
2469 
2470 	__lock_sock(sk);
2471 	sk->sk_lock.owned = 1;
2472 	spin_unlock(&sk->sk_lock.slock);
2473 	/*
2474 	 * The sk_lock has mutex_lock() semantics here:
2475 	 */
2476 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2477 	local_bh_enable();
2478 	return true;
2479 }
2480 EXPORT_SYMBOL(lock_sock_fast);
2481 
2482 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2483 {
2484 	struct timeval tv;
2485 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2486 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2487 	tv = ktime_to_timeval(sk->sk_stamp);
2488 	if (tv.tv_sec == -1)
2489 		return -ENOENT;
2490 	if (tv.tv_sec == 0) {
2491 		sk->sk_stamp = ktime_get_real();
2492 		tv = ktime_to_timeval(sk->sk_stamp);
2493 	}
2494 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2495 }
2496 EXPORT_SYMBOL(sock_get_timestamp);
2497 
2498 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2499 {
2500 	struct timespec ts;
2501 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2502 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2503 	ts = ktime_to_timespec(sk->sk_stamp);
2504 	if (ts.tv_sec == -1)
2505 		return -ENOENT;
2506 	if (ts.tv_sec == 0) {
2507 		sk->sk_stamp = ktime_get_real();
2508 		ts = ktime_to_timespec(sk->sk_stamp);
2509 	}
2510 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2511 }
2512 EXPORT_SYMBOL(sock_get_timestampns);
2513 
2514 void sock_enable_timestamp(struct sock *sk, int flag)
2515 {
2516 	if (!sock_flag(sk, flag)) {
2517 		unsigned long previous_flags = sk->sk_flags;
2518 
2519 		sock_set_flag(sk, flag);
2520 		/*
2521 		 * we just set one of the two flags which require net
2522 		 * time stamping, but time stamping might have been on
2523 		 * already because of the other one
2524 		 */
2525 		if (sock_needs_netstamp(sk) &&
2526 		    !(previous_flags & SK_FLAGS_TIMESTAMP))
2527 			net_enable_timestamp();
2528 	}
2529 }
2530 
2531 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2532 		       int level, int type)
2533 {
2534 	struct sock_exterr_skb *serr;
2535 	struct sk_buff *skb;
2536 	int copied, err;
2537 
2538 	err = -EAGAIN;
2539 	skb = sock_dequeue_err_skb(sk);
2540 	if (skb == NULL)
2541 		goto out;
2542 
2543 	copied = skb->len;
2544 	if (copied > len) {
2545 		msg->msg_flags |= MSG_TRUNC;
2546 		copied = len;
2547 	}
2548 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2549 	if (err)
2550 		goto out_free_skb;
2551 
2552 	sock_recv_timestamp(msg, sk, skb);
2553 
2554 	serr = SKB_EXT_ERR(skb);
2555 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2556 
2557 	msg->msg_flags |= MSG_ERRQUEUE;
2558 	err = copied;
2559 
2560 out_free_skb:
2561 	kfree_skb(skb);
2562 out:
2563 	return err;
2564 }
2565 EXPORT_SYMBOL(sock_recv_errqueue);
2566 
2567 /*
2568  *	Get a socket option on an socket.
2569  *
2570  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2571  *	asynchronous errors should be reported by getsockopt. We assume
2572  *	this means if you specify SO_ERROR (otherwise whats the point of it).
2573  */
2574 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2575 			   char __user *optval, int __user *optlen)
2576 {
2577 	struct sock *sk = sock->sk;
2578 
2579 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2580 }
2581 EXPORT_SYMBOL(sock_common_getsockopt);
2582 
2583 #ifdef CONFIG_COMPAT
2584 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2585 				  char __user *optval, int __user *optlen)
2586 {
2587 	struct sock *sk = sock->sk;
2588 
2589 	if (sk->sk_prot->compat_getsockopt != NULL)
2590 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2591 						      optval, optlen);
2592 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2593 }
2594 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2595 #endif
2596 
2597 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2598 			int flags)
2599 {
2600 	struct sock *sk = sock->sk;
2601 	int addr_len = 0;
2602 	int err;
2603 
2604 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2605 				   flags & ~MSG_DONTWAIT, &addr_len);
2606 	if (err >= 0)
2607 		msg->msg_namelen = addr_len;
2608 	return err;
2609 }
2610 EXPORT_SYMBOL(sock_common_recvmsg);
2611 
2612 /*
2613  *	Set socket options on an inet socket.
2614  */
2615 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2616 			   char __user *optval, unsigned int optlen)
2617 {
2618 	struct sock *sk = sock->sk;
2619 
2620 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2621 }
2622 EXPORT_SYMBOL(sock_common_setsockopt);
2623 
2624 #ifdef CONFIG_COMPAT
2625 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2626 				  char __user *optval, unsigned int optlen)
2627 {
2628 	struct sock *sk = sock->sk;
2629 
2630 	if (sk->sk_prot->compat_setsockopt != NULL)
2631 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2632 						      optval, optlen);
2633 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2634 }
2635 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2636 #endif
2637 
2638 void sk_common_release(struct sock *sk)
2639 {
2640 	if (sk->sk_prot->destroy)
2641 		sk->sk_prot->destroy(sk);
2642 
2643 	/*
2644 	 * Observation: when sock_common_release is called, processes have
2645 	 * no access to socket. But net still has.
2646 	 * Step one, detach it from networking:
2647 	 *
2648 	 * A. Remove from hash tables.
2649 	 */
2650 
2651 	sk->sk_prot->unhash(sk);
2652 
2653 	/*
2654 	 * In this point socket cannot receive new packets, but it is possible
2655 	 * that some packets are in flight because some CPU runs receiver and
2656 	 * did hash table lookup before we unhashed socket. They will achieve
2657 	 * receive queue and will be purged by socket destructor.
2658 	 *
2659 	 * Also we still have packets pending on receive queue and probably,
2660 	 * our own packets waiting in device queues. sock_destroy will drain
2661 	 * receive queue, but transmitted packets will delay socket destruction
2662 	 * until the last reference will be released.
2663 	 */
2664 
2665 	sock_orphan(sk);
2666 
2667 	xfrm_sk_free_policy(sk);
2668 
2669 	sk_refcnt_debug_release(sk);
2670 
2671 	if (sk->sk_frag.page) {
2672 		put_page(sk->sk_frag.page);
2673 		sk->sk_frag.page = NULL;
2674 	}
2675 
2676 	sock_put(sk);
2677 }
2678 EXPORT_SYMBOL(sk_common_release);
2679 
2680 #ifdef CONFIG_PROC_FS
/* Number of per-protocol "in use" counter slots; should be enough for now. */
#define PROTO_INUSE_NR	64

/* One "sockets in use" counter per protocol slot, indexed by proto->inuse_idx. */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};
2685 
/* Bitmap of counter slots handed out by assign_proto_idx(). */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2687 
2688 #ifdef CONFIG_NET_NS
2689 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2690 {
2691 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2692 }
2693 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2694 
2695 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2696 {
2697 	int cpu, idx = prot->inuse_idx;
2698 	int res = 0;
2699 
2700 	for_each_possible_cpu(cpu)
2701 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2702 
2703 	return res >= 0 ? res : 0;
2704 }
2705 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2706 
2707 static int __net_init sock_inuse_init_net(struct net *net)
2708 {
2709 	net->core.inuse = alloc_percpu(struct prot_inuse);
2710 	return net->core.inuse ? 0 : -ENOMEM;
2711 }
2712 
2713 static void __net_exit sock_inuse_exit_net(struct net *net)
2714 {
2715 	free_percpu(net->core.inuse);
2716 }
2717 
2718 static struct pernet_operations net_inuse_ops = {
2719 	.init = sock_inuse_init_net,
2720 	.exit = sock_inuse_exit_net,
2721 };
2722 
2723 static __init int net_inuse_init(void)
2724 {
2725 	if (register_pernet_subsys(&net_inuse_ops))
2726 		panic("Cannot initialize net inuse counters");
2727 
2728 	return 0;
2729 }
2730 
2731 core_initcall(net_inuse_init);
2732 #else
2733 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2734 
2735 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2736 {
2737 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2738 }
2739 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2740 
2741 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2742 {
2743 	int cpu, idx = prot->inuse_idx;
2744 	int res = 0;
2745 
2746 	for_each_possible_cpu(cpu)
2747 		res += per_cpu(prot_inuse, cpu).val[idx];
2748 
2749 	return res >= 0 ? res : 0;
2750 }
2751 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2752 #endif
2753 
2754 static void assign_proto_idx(struct proto *prot)
2755 {
2756 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2757 
2758 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2759 		pr_err("PROTO_INUSE_NR exhausted\n");
2760 		return;
2761 	}
2762 
2763 	set_bit(prot->inuse_idx, proto_inuse_idx);
2764 }
2765 
2766 static void release_proto_idx(struct proto *prot)
2767 {
2768 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2769 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2770 }
2771 #else
/* No in-use counters without CONFIG_PROC_FS: nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
2775 
/* No in-use counters without CONFIG_PROC_FS: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
2779 #endif
2780 
2781 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2782 {
2783 	if (!rsk_prot)
2784 		return;
2785 	kfree(rsk_prot->slab_name);
2786 	rsk_prot->slab_name = NULL;
2787 	kmem_cache_destroy(rsk_prot->slab);
2788 	rsk_prot->slab = NULL;
2789 }
2790 
2791 static int req_prot_init(const struct proto *prot)
2792 {
2793 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2794 
2795 	if (!rsk_prot)
2796 		return 0;
2797 
2798 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2799 					prot->name);
2800 	if (!rsk_prot->slab_name)
2801 		return -ENOMEM;
2802 
2803 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2804 					   rsk_prot->obj_size, 0,
2805 					   prot->slab_flags, NULL);
2806 
2807 	if (!rsk_prot->slab) {
2808 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2809 			prot->name);
2810 		return -ENOMEM;
2811 	}
2812 	return 0;
2813 }
2814 
2815 int proto_register(struct proto *prot, int alloc_slab)
2816 {
2817 	if (alloc_slab) {
2818 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2819 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2820 					NULL);
2821 
2822 		if (prot->slab == NULL) {
2823 			pr_crit("%s: Can't create sock SLAB cache!\n",
2824 				prot->name);
2825 			goto out;
2826 		}
2827 
2828 		if (req_prot_init(prot))
2829 			goto out_free_request_sock_slab;
2830 
2831 		if (prot->twsk_prot != NULL) {
2832 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2833 
2834 			if (prot->twsk_prot->twsk_slab_name == NULL)
2835 				goto out_free_request_sock_slab;
2836 
2837 			prot->twsk_prot->twsk_slab =
2838 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2839 						  prot->twsk_prot->twsk_obj_size,
2840 						  0,
2841 						  prot->slab_flags,
2842 						  NULL);
2843 			if (prot->twsk_prot->twsk_slab == NULL)
2844 				goto out_free_timewait_sock_slab_name;
2845 		}
2846 	}
2847 
2848 	mutex_lock(&proto_list_mutex);
2849 	list_add(&prot->node, &proto_list);
2850 	assign_proto_idx(prot);
2851 	mutex_unlock(&proto_list_mutex);
2852 	return 0;
2853 
2854 out_free_timewait_sock_slab_name:
2855 	kfree(prot->twsk_prot->twsk_slab_name);
2856 out_free_request_sock_slab:
2857 	req_prot_cleanup(prot->rsk_prot);
2858 
2859 	kmem_cache_destroy(prot->slab);
2860 	prot->slab = NULL;
2861 out:
2862 	return -ENOBUFS;
2863 }
2864 EXPORT_SYMBOL(proto_register);
2865 
2866 void proto_unregister(struct proto *prot)
2867 {
2868 	mutex_lock(&proto_list_mutex);
2869 	release_proto_idx(prot);
2870 	list_del(&prot->node);
2871 	mutex_unlock(&proto_list_mutex);
2872 
2873 	kmem_cache_destroy(prot->slab);
2874 	prot->slab = NULL;
2875 
2876 	req_prot_cleanup(prot->rsk_prot);
2877 
2878 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2879 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2880 		kfree(prot->twsk_prot->twsk_slab_name);
2881 		prot->twsk_prot->twsk_slab = NULL;
2882 	}
2883 }
2884 EXPORT_SYMBOL(proto_unregister);
2885 
2886 #ifdef CONFIG_PROC_FS
2887 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2888 	__acquires(proto_list_mutex)
2889 {
2890 	mutex_lock(&proto_list_mutex);
2891 	return seq_list_start_head(&proto_list, *pos);
2892 }
2893 
2894 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2895 {
2896 	return seq_list_next(v, &proto_list, pos);
2897 }
2898 
2899 static void proto_seq_stop(struct seq_file *seq, void *v)
2900 	__releases(proto_list_mutex)
2901 {
2902 	mutex_unlock(&proto_list_mutex);
2903 }
2904 
/* Map a method pointer to the flag character shown in the listing: 'y' if set, 'n' if NULL. */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
2909 static long sock_prot_memory_allocated(struct proto *proto)
2910 {
2911 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2912 }
2913 
2914 static char *sock_prot_memory_pressure(struct proto *proto)
2915 {
2916 	return proto->memory_pressure != NULL ?
2917 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2918 }
2919 
2920 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2921 {
2922 
2923 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2924 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2925 		   proto->name,
2926 		   proto->obj_size,
2927 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2928 		   sock_prot_memory_allocated(proto),
2929 		   sock_prot_memory_pressure(proto),
2930 		   proto->max_header,
2931 		   proto->slab == NULL ? "no" : "yes",
2932 		   module_name(proto->owner),
2933 		   proto_method_implemented(proto->close),
2934 		   proto_method_implemented(proto->connect),
2935 		   proto_method_implemented(proto->disconnect),
2936 		   proto_method_implemented(proto->accept),
2937 		   proto_method_implemented(proto->ioctl),
2938 		   proto_method_implemented(proto->init),
2939 		   proto_method_implemented(proto->destroy),
2940 		   proto_method_implemented(proto->shutdown),
2941 		   proto_method_implemented(proto->setsockopt),
2942 		   proto_method_implemented(proto->getsockopt),
2943 		   proto_method_implemented(proto->sendmsg),
2944 		   proto_method_implemented(proto->recvmsg),
2945 		   proto_method_implemented(proto->sendpage),
2946 		   proto_method_implemented(proto->bind),
2947 		   proto_method_implemented(proto->backlog_rcv),
2948 		   proto_method_implemented(proto->hash),
2949 		   proto_method_implemented(proto->unhash),
2950 		   proto_method_implemented(proto->get_port),
2951 		   proto_method_implemented(proto->enter_memory_pressure));
2952 }
2953 
2954 static int proto_seq_show(struct seq_file *seq, void *v)
2955 {
2956 	if (v == &proto_list)
2957 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2958 			   "protocol",
2959 			   "size",
2960 			   "sockets",
2961 			   "memory",
2962 			   "press",
2963 			   "maxhdr",
2964 			   "slab",
2965 			   "module",
2966 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2967 	else
2968 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2969 	return 0;
2970 }
2971 
2972 static const struct seq_operations proto_seq_ops = {
2973 	.start  = proto_seq_start,
2974 	.next   = proto_seq_next,
2975 	.stop   = proto_seq_stop,
2976 	.show   = proto_seq_show,
2977 };
2978 
2979 static int proto_seq_open(struct inode *inode, struct file *file)
2980 {
2981 	return seq_open_net(inode, file, &proto_seq_ops,
2982 			    sizeof(struct seq_net_private));
2983 }
2984 
2985 static const struct file_operations proto_seq_fops = {
2986 	.owner		= THIS_MODULE,
2987 	.open		= proto_seq_open,
2988 	.read		= seq_read,
2989 	.llseek		= seq_lseek,
2990 	.release	= seq_release_net,
2991 };
2992 
2993 static __net_init int proto_init_net(struct net *net)
2994 {
2995 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2996 		return -ENOMEM;
2997 
2998 	return 0;
2999 }
3000 
3001 static __net_exit void proto_exit_net(struct net *net)
3002 {
3003 	remove_proc_entry("protocols", net->proc_net);
3004 }
3005 
3006 
3007 static __net_initdata struct pernet_operations proto_net_ops = {
3008 	.init = proto_init_net,
3009 	.exit = proto_exit_net,
3010 };
3011 
3012 static int __init proto_init(void)
3013 {
3014 	return register_pernet_subsys(&proto_net_ops);
3015 }
3016 
3017 subsys_initcall(proto_init);
3018 
3019 #endif /* PROC_FS */
3020