xref: /linux/net/core/sock.c (revision 3ce095c16263630dde46d6051854073edaacf3d7)
1 /*
2  * INET		An implementation of the TCP/IP protocol suite for the LINUX
3  *		operating system.  INET is implemented using the  BSD Socket
4  *		interface as the means of communication with the user level.
5  *
6  *		Generic socket support routines. Memory allocators, socket lock/release
7  *		handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:	Ross Biro
11  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *		Florian La Roche, <flla@stud.uni-sb.de>
13  *		Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *		Alan Cox	: 	Numerous verify_area() problems
17  *		Alan Cox	:	Connecting on a connecting socket
18  *					now returns an error for tcp.
19  *		Alan Cox	:	sock->protocol is set correctly.
20  *					and is not sometimes left as 0.
21  *		Alan Cox	:	connect handles icmp errors on a
22  *					connect properly. Unfortunately there
23  *					is a restart syscall nasty there. I
24  *					can't match BSD without hacking the C
25  *					library. Ideas urgently sought!
26  *		Alan Cox	:	Disallow bind() to addresses that are
27  *					not ours - especially broadcast ones!!
28  *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
29  *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
30  *					instead they leave that for the DESTROY timer.
31  *		Alan Cox	:	Clean up error flag in accept
32  *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
33  *					was buggy. Put a remove_sock() in the handler
34  *					for memory when we hit 0. Also altered the timer
35  *					code. The ACK stuff can wait and needs major
36  *					TCP layer surgery.
37  *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
38  *					and fixed timer/inet_bh race.
39  *		Alan Cox	:	Added zapped flag for TCP
40  *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
41  *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
43  *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
46  *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
47  *	Pauline Middelink	:	identd support
48  *		Alan Cox	:	Fixed connect() taking signals I think.
49  *		Alan Cox	:	SO_LINGER supported
50  *		Alan Cox	:	Error reporting fixes
51  *		Anonymous	:	inet_create tidied up (sk->reuse setting)
52  *		Alan Cox	:	inet sockets don't set sk->type!
53  *		Alan Cox	:	Split socket option code
54  *		Alan Cox	:	Callbacks
55  *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
56  *		Alex		:	Removed restriction on inet fioctl
57  *		Alan Cox	:	Splitting INET from NET core
58  *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
59  *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
60  *		Alan Cox	:	Split IP from generic code
61  *		Alan Cox	:	New kfree_skbmem()
62  *		Alan Cox	:	Make SO_DEBUG superuser only.
63  *		Alan Cox	:	Allow anyone to clear SO_DEBUG
64  *					(compatibility fix)
65  *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
66  *		Alan Cox	:	Allocator for a socket is settable.
67  *		Alan Cox	:	SO_ERROR includes soft errors.
68  *		Alan Cox	:	Allow NULL arguments on some SO_ opts
69  *		Alan Cox	: 	Generic socket allocation to make hooks
70  *					easier (suggested by Craig Metz).
71  *		Michael Pall	:	SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
79  *		Andi Kleen	:	Fix write_space callback
80  *		Chris Evans	:	Security fixes - signedness again
81  *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *		This program is free software; you can redistribute it and/or
87  *		modify it under the terms of the GNU General Public License
88  *		as published by the Free Software Foundation; either version
89  *		2 of the License, or (at your option) any later version.
90  */
91 
92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
93 
94 #include <linux/capability.h>
95 #include <linux/errno.h>
96 #include <linux/errqueue.h>
97 #include <linux/types.h>
98 #include <linux/socket.h>
99 #include <linux/in.h>
100 #include <linux/kernel.h>
101 #include <linux/module.h>
102 #include <linux/proc_fs.h>
103 #include <linux/seq_file.h>
104 #include <linux/sched.h>
105 #include <linux/timer.h>
106 #include <linux/string.h>
107 #include <linux/sockios.h>
108 #include <linux/net.h>
109 #include <linux/mm.h>
110 #include <linux/slab.h>
111 #include <linux/interrupt.h>
112 #include <linux/poll.h>
113 #include <linux/tcp.h>
114 #include <linux/init.h>
115 #include <linux/highmem.h>
116 #include <linux/user_namespace.h>
117 #include <linux/static_key.h>
118 #include <linux/memcontrol.h>
119 #include <linux/prefetch.h>
120 
121 #include <asm/uaccess.h>
122 
123 #include <linux/netdevice.h>
124 #include <net/protocol.h>
125 #include <linux/skbuff.h>
126 #include <net/net_namespace.h>
127 #include <net/request_sock.h>
128 #include <net/sock.h>
129 #include <linux/net_tstamp.h>
130 #include <net/xfrm.h>
131 #include <linux/ipsec.h>
132 #include <net/cls_cgroup.h>
133 #include <net/netprio_cgroup.h>
134 
135 #include <linux/filter.h>
136 
137 #include <trace/events/sock.h>
138 
139 #ifdef CONFIG_INET
140 #include <net/tcp.h>
141 #endif
142 
143 #include <net/busy_poll.h>
144 
145 static DEFINE_MUTEX(proto_list_mutex);
146 static LIST_HEAD(proto_list);
147 
148 /**
149  * sk_ns_capable - General socket capability test
150  * @sk: Socket to use a capability on or through
151  * @user_ns: The user namespace of the capability to use
152  * @cap: The capability to use
153  *
154  * Test to see if the opener of the socket had the capability @cap when
155  * the socket was created and if the current process has that capability
156  * in the user namespace @user_ns.
157  */
158 bool sk_ns_capable(const struct sock *sk,
159 		   struct user_namespace *user_ns, int cap)
160 {
161 	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
162 		ns_capable(user_ns, cap);
163 }
164 EXPORT_SYMBOL(sk_ns_capable);
165 
166 /**
167  * sk_capable - Socket global capability test
168  * @sk: Socket to use a capability on or through
169  * @cap: The global capability to use
170  *
171  * Test to see if the opener of the socket had the capability @cap when
172  * the socket was created and if the current process has that capability
173  * in all user namespaces.
174  */
175 bool sk_capable(const struct sock *sk, int cap)
176 {
177 	return sk_ns_capable(sk, &init_user_ns, cap);
178 }
179 EXPORT_SYMBOL(sk_capable);
180 
181 /**
182  * sk_net_capable - Network namespace socket capability test
183  * @sk: Socket to use a capability on or through
184  * @cap: The capability to use
185  *
186  * Test to see if the opener of the socket had the capability @cap when the
187  * socket was created and if the current process has that capability over
188  * the network namespace the socket is a member of.
189  */
190 bool sk_net_capable(const struct sock *sk, int cap)
191 {
192 	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
193 }
194 EXPORT_SYMBOL(sk_net_capable);
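/* A minimal sketch of how a protocol might use the capability helpers
 * above to gate a privileged option; proto_set_priv_opt() is a
 * hypothetical function used only for illustration:
 *
 *	static int proto_set_priv_opt(struct sock *sk, int val)
 *	{
 *		if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *			return -EPERM;
 *		sk->sk_priority = val;
 *		return 0;
 *	}
 *
 * sk_capable() would be used instead when the check must be made against
 * the initial user namespace.
 */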
195 
196 
197 #ifdef CONFIG_MEMCG_KMEM
198 int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
199 {
200 	struct proto *proto;
201 	int ret = 0;
202 
203 	mutex_lock(&proto_list_mutex);
204 	list_for_each_entry(proto, &proto_list, node) {
205 		if (proto->init_cgroup) {
206 			ret = proto->init_cgroup(memcg, ss);
207 			if (ret)
208 				goto out;
209 		}
210 	}
211 
212 	mutex_unlock(&proto_list_mutex);
213 	return ret;
214 out:
215 	list_for_each_entry_continue_reverse(proto, &proto_list, node)
216 		if (proto->destroy_cgroup)
217 			proto->destroy_cgroup(memcg);
218 	mutex_unlock(&proto_list_mutex);
219 	return ret;
220 }
221 
222 void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
223 {
224 	struct proto *proto;
225 
226 	mutex_lock(&proto_list_mutex);
227 	list_for_each_entry_reverse(proto, &proto_list, node)
228 		if (proto->destroy_cgroup)
229 			proto->destroy_cgroup(memcg);
230 	mutex_unlock(&proto_list_mutex);
231 }
232 #endif
233 
234 /*
235  * Each address family might have different locking rules, so we have
236  * one slock key per address family:
237  */
238 static struct lock_class_key af_family_keys[AF_MAX];
239 static struct lock_class_key af_family_slock_keys[AF_MAX];
240 
241 #if defined(CONFIG_MEMCG_KMEM)
242 struct static_key memcg_socket_limit_enabled;
243 EXPORT_SYMBOL(memcg_socket_limit_enabled);
244 #endif
245 
246 /*
247  * Make lock validator output more readable. (we pre-construct these
248  * strings build-time, so that runtime initialization of socket
249  * locks is fast):
250  */
251 static const char *const af_family_key_strings[AF_MAX+1] = {
252   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
253   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
254   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
255   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
256   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
257   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
258   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
259   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
260   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
261   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
262   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
263   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
264   "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
265   "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
266 };
267 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
268   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
269   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
270   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
271   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
272   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
273   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
274   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
275   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
276   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
277   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
278   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
279   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
280   "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
281   "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
282 };
283 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
284   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
285   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
286   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
287   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
288   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
289   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
290   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
291   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
292   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
293   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
294   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
295   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
296   "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
297   "clock-AF_NFC"   , "clock-AF_VSOCK"    , "clock-AF_MAX"
298 };
299 
300 /*
301  * sk_callback_lock locking rules are per-address-family,
302  * so split the lock classes by using a per-AF key:
303  */
304 static struct lock_class_key af_callback_keys[AF_MAX];
305 
306 /* Take into consideration the size of the struct sk_buff overhead in the
307  * determination of these values, since that is non-constant across
308  * platforms.  This makes socket queueing behavior and performance
309  * not depend upon such differences.
310  */
311 #define _SK_MEM_PACKETS		256
312 #define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
313 #define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
314 #define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
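/* Worked example (figures assume a typical 64-bit build, where the
 * aligned struct sk_buff and struct skb_shared_info overhead brings
 * SKB_TRUESIZE(256) to roughly 832 bytes):
 *
 *	SK_WMEM_MAX = SK_RMEM_MAX ~= 832 * 256 = 212992 bytes (~208 KiB)
 *
 * which is the familiar default reported by net.core.wmem_max and
 * net.core.rmem_max on such systems.
 */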
315 
316 /* Run time adjustable parameters. */
317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
318 EXPORT_SYMBOL(sysctl_wmem_max);
319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
320 EXPORT_SYMBOL(sysctl_rmem_max);
321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
323 
324 /* Maximal space eaten by iovec or ancillary data plus some space */
325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
326 EXPORT_SYMBOL(sysctl_optmem_max);
327 
328 int sysctl_tstamp_allow_data __read_mostly = 1;
329 
330 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
331 EXPORT_SYMBOL_GPL(memalloc_socks);
332 
333 /**
334  * sk_set_memalloc - sets %SOCK_MEMALLOC
335  * @sk: socket to set it on
336  *
337  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
338  * It's the responsibility of the admin to adjust min_free_kbytes
339  * to meet the requirements.
340  */
341 void sk_set_memalloc(struct sock *sk)
342 {
343 	sock_set_flag(sk, SOCK_MEMALLOC);
344 	sk->sk_allocation |= __GFP_MEMALLOC;
345 	static_key_slow_inc(&memalloc_socks);
346 }
347 EXPORT_SYMBOL_GPL(sk_set_memalloc);
348 
349 void sk_clear_memalloc(struct sock *sk)
350 {
351 	sock_reset_flag(sk, SOCK_MEMALLOC);
352 	sk->sk_allocation &= ~__GFP_MEMALLOC;
353 	static_key_slow_dec(&memalloc_socks);
354 
355 	/*
356 	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
357 	 * progress of swapping. SOCK_MEMALLOC may be cleared while
358 	 * it has rmem allocations due to the last swapfile being deactivated
359 	 * but there is a risk that the socket is unusable due to exceeding
360 	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
361 	 */
362 	sk_mem_reclaim(sk);
363 }
364 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
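/* A minimal sketch (hypothetical caller): a swap-over-network transport
 * would mark its kernel socket while a swapfile is active and clear the
 * flag when the last swapfile goes away; "xprt" is an assumed
 * driver-private structure used only for illustration:
 *
 *	sk_set_memalloc(xprt->sock->sk);	on swapon
 *	...
 *	sk_clear_memalloc(xprt->sock->sk);	on swapoff
 */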
365 
366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
367 {
368 	int ret;
369 	unsigned long pflags = current->flags;
370 
371 	/* these should have been dropped before queueing */
372 	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
373 
374 	current->flags |= PF_MEMALLOC;
375 	ret = sk->sk_backlog_rcv(sk, skb);
376 	tsk_restore_flags(current, pflags, PF_MEMALLOC);
377 
378 	return ret;
379 }
380 EXPORT_SYMBOL(__sk_backlog_rcv);
381 
382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
383 {
384 	struct timeval tv;
385 
386 	if (optlen < sizeof(tv))
387 		return -EINVAL;
388 	if (copy_from_user(&tv, optval, sizeof(tv)))
389 		return -EFAULT;
390 	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
391 		return -EDOM;
392 
393 	if (tv.tv_sec < 0) {
394 		static int warned __read_mostly;
395 
396 		*timeo_p = 0;
397 		if (warned < 10 && net_ratelimit()) {
398 			warned++;
399 			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
400 				__func__, current->comm, task_pid_nr(current));
401 		}
402 		return 0;
403 	}
404 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
405 	if (tv.tv_sec == 0 && tv.tv_usec == 0)
406 		return 0;
407 	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
408 		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
409 	return 0;
410 }
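/* Worked example (assumes HZ=250, i.e. one jiffy = 4000 us): a timeout of
 * { .tv_sec = 2, .tv_usec = 500000 } passed to sock_set_timeout() becomes
 *
 *	*timeo_p = 2 * 250 + (500000 + 3999) / 4000 = 500 + 125 = 625 jiffies
 *
 * i.e. exactly 2.5 seconds, with the microsecond part rounded up to whole
 * jiffies.
 */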
411 
412 static void sock_warn_obsolete_bsdism(const char *name)
413 {
414 	static int warned;
415 	static char warncomm[TASK_COMM_LEN];
416 	if (strcmp(warncomm, current->comm) && warned < 5) {
417 		strcpy(warncomm,  current->comm);
418 		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
419 			warncomm, name);
420 		warned++;
421 	}
422 }
423 
424 #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
425 
426 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
427 {
428 	if (sk->sk_flags & flags) {
429 		sk->sk_flags &= ~flags;
430 		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
431 			net_disable_timestamp();
432 	}
433 }
434 
435 
436 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
437 {
438 	int err;
439 	unsigned long flags;
440 	struct sk_buff_head *list = &sk->sk_receive_queue;
441 
442 	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
443 		atomic_inc(&sk->sk_drops);
444 		trace_sock_rcvqueue_full(sk, skb);
445 		return -ENOMEM;
446 	}
447 
448 	err = sk_filter(sk, skb);
449 	if (err)
450 		return err;
451 
452 	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
453 		atomic_inc(&sk->sk_drops);
454 		return -ENOBUFS;
455 	}
456 
457 	skb->dev = NULL;
458 	skb_set_owner_r(skb, sk);
459 
460 	/* we escape from the RCU-protected region, make sure we don't leak
461 	 * a non-refcounted dst
462 	 */
463 	skb_dst_force(skb);
464 
465 	spin_lock_irqsave(&list->lock, flags);
466 	sock_skb_set_dropcount(sk, skb);
467 	__skb_queue_tail(list, skb);
468 	spin_unlock_irqrestore(&list->lock, flags);
469 
470 	if (!sock_flag(sk, SOCK_DEAD))
471 		sk->sk_data_ready(sk);
472 	return 0;
473 }
474 EXPORT_SYMBOL(sock_queue_rcv_skb);
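/* A minimal sketch (hypothetical protocol receive path): since
 * sock_queue_rcv_skb() does not consume the skb on failure, callers
 * typically free it and account the drop themselves:
 *
 *	static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (sock_queue_rcv_skb(sk, skb) < 0) {
 *			kfree_skb(skb);
 *			return NET_RX_DROP;
 *		}
 *		return NET_RX_SUCCESS;
 *	}
 */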
475 
476 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
477 {
478 	int rc = NET_RX_SUCCESS;
479 
480 	if (sk_filter(sk, skb))
481 		goto discard_and_relse;
482 
483 	skb->dev = NULL;
484 
485 	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
486 		atomic_inc(&sk->sk_drops);
487 		goto discard_and_relse;
488 	}
489 	if (nested)
490 		bh_lock_sock_nested(sk);
491 	else
492 		bh_lock_sock(sk);
493 	if (!sock_owned_by_user(sk)) {
494 		/*
495 		 * trylock + unlock semantics:
496 		 */
497 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
498 
499 		rc = sk_backlog_rcv(sk, skb);
500 
501 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
502 	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
503 		bh_unlock_sock(sk);
504 		atomic_inc(&sk->sk_drops);
505 		goto discard_and_relse;
506 	}
507 
508 	bh_unlock_sock(sk);
509 out:
510 	sock_put(sk);
511 	return rc;
512 discard_and_relse:
513 	kfree_skb(skb);
514 	goto out;
515 }
516 EXPORT_SYMBOL(sk_receive_skb);
517 
518 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
519 {
520 	struct dst_entry *dst = __sk_dst_get(sk);
521 
522 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
523 		sk_tx_queue_clear(sk);
524 		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
525 		dst_release(dst);
526 		return NULL;
527 	}
528 
529 	return dst;
530 }
531 EXPORT_SYMBOL(__sk_dst_check);
532 
533 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
534 {
535 	struct dst_entry *dst = sk_dst_get(sk);
536 
537 	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
538 		sk_dst_reset(sk);
539 		dst_release(dst);
540 		return NULL;
541 	}
542 
543 	return dst;
544 }
545 EXPORT_SYMBOL(sk_dst_check);
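/* A minimal sketch (simplified, hypothetical output path): callers pass a
 * routing cookie so that an obsoleted cached dst is dropped and looked up
 * again instead of being reused; example_route_output() stands in for the
 * per-protocol route lookup:
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst)
 *		dst = example_route_output(sk);
 */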
546 
547 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
548 				int optlen)
549 {
550 	int ret = -ENOPROTOOPT;
551 #ifdef CONFIG_NETDEVICES
552 	struct net *net = sock_net(sk);
553 	char devname[IFNAMSIZ];
554 	int index;
555 
556 	/* Sorry... */
557 	ret = -EPERM;
558 	if (!ns_capable(net->user_ns, CAP_NET_RAW))
559 		goto out;
560 
561 	ret = -EINVAL;
562 	if (optlen < 0)
563 		goto out;
564 
565 	/* Bind this socket to a particular device like "eth0",
566 	 * as specified in the passed interface name. If the
567 	 * name is "" or the option length is zero the socket
568 	 * is not bound.
569 	 */
570 	if (optlen > IFNAMSIZ - 1)
571 		optlen = IFNAMSIZ - 1;
572 	memset(devname, 0, sizeof(devname));
573 
574 	ret = -EFAULT;
575 	if (copy_from_user(devname, optval, optlen))
576 		goto out;
577 
578 	index = 0;
579 	if (devname[0] != '\0') {
580 		struct net_device *dev;
581 
582 		rcu_read_lock();
583 		dev = dev_get_by_name_rcu(net, devname);
584 		if (dev)
585 			index = dev->ifindex;
586 		rcu_read_unlock();
587 		ret = -ENODEV;
588 		if (!dev)
589 			goto out;
590 	}
591 
592 	lock_sock(sk);
593 	sk->sk_bound_dev_if = index;
594 	sk_dst_reset(sk);
595 	release_sock(sk);
596 
597 	ret = 0;
598 
599 out:
600 #endif
601 
602 	return ret;
603 }
604 
605 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
606 				int __user *optlen, int len)
607 {
608 	int ret = -ENOPROTOOPT;
609 #ifdef CONFIG_NETDEVICES
610 	struct net *net = sock_net(sk);
611 	char devname[IFNAMSIZ];
612 
613 	if (sk->sk_bound_dev_if == 0) {
614 		len = 0;
615 		goto zero;
616 	}
617 
618 	ret = -EINVAL;
619 	if (len < IFNAMSIZ)
620 		goto out;
621 
622 	ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
623 	if (ret)
624 		goto out;
625 
626 	len = strlen(devname) + 1;
627 
628 	ret = -EFAULT;
629 	if (copy_to_user(optval, devname, len))
630 		goto out;
631 
632 zero:
633 	ret = -EFAULT;
634 	if (put_user(len, optlen))
635 		goto out;
636 
637 	ret = 0;
638 
639 out:
640 #endif
641 
642 	return ret;
643 }
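/* Userspace example (illustrative only; "eth0" is just a sample name):
 * binding a socket to a device through the option handled above requires
 * CAP_NET_RAW, and an empty name (or zero length) removes the binding:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, strlen(ifname) + 1) < 0)
 *		perror("SO_BINDTODEVICE");
 */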
644 
645 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
646 {
647 	if (valbool)
648 		sock_set_flag(sk, bit);
649 	else
650 		sock_reset_flag(sk, bit);
651 }
652 
653 bool sk_mc_loop(struct sock *sk)
654 {
655 	if (dev_recursion_level())
656 		return false;
657 	if (!sk)
658 		return true;
659 	switch (sk->sk_family) {
660 	case AF_INET:
661 		return inet_sk(sk)->mc_loop;
662 #if IS_ENABLED(CONFIG_IPV6)
663 	case AF_INET6:
664 		return inet6_sk(sk)->mc_loop;
665 #endif
666 	}
667 	WARN_ON(1);
668 	return true;
669 }
670 EXPORT_SYMBOL(sk_mc_loop);
671 
672 /*
673  *	This is meant for all protocols to use and covers goings on
674  *	at the socket level. Everything here is generic.
675  */
676 
677 int sock_setsockopt(struct socket *sock, int level, int optname,
678 		    char __user *optval, unsigned int optlen)
679 {
680 	struct sock *sk = sock->sk;
681 	int val;
682 	int valbool;
683 	struct linger ling;
684 	int ret = 0;
685 
686 	/*
687 	 *	Options without arguments
688 	 */
689 
690 	if (optname == SO_BINDTODEVICE)
691 		return sock_setbindtodevice(sk, optval, optlen);
692 
693 	if (optlen < sizeof(int))
694 		return -EINVAL;
695 
696 	if (get_user(val, (int __user *)optval))
697 		return -EFAULT;
698 
699 	valbool = val ? 1 : 0;
700 
701 	lock_sock(sk);
702 
703 	switch (optname) {
704 	case SO_DEBUG:
705 		if (val && !capable(CAP_NET_ADMIN))
706 			ret = -EACCES;
707 		else
708 			sock_valbool_flag(sk, SOCK_DBG, valbool);
709 		break;
710 	case SO_REUSEADDR:
711 		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
712 		break;
713 	case SO_REUSEPORT:
714 		sk->sk_reuseport = valbool;
715 		break;
716 	case SO_TYPE:
717 	case SO_PROTOCOL:
718 	case SO_DOMAIN:
719 	case SO_ERROR:
720 		ret = -ENOPROTOOPT;
721 		break;
722 	case SO_DONTROUTE:
723 		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
724 		break;
725 	case SO_BROADCAST:
726 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
727 		break;
728 	case SO_SNDBUF:
729 		/* Don't error on this; BSD doesn't, and if you think
730 		 * about it this is right. Otherwise apps have to
731 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
732 		 * are treated in BSD as hints.
733 		 */
734 		val = min_t(u32, val, sysctl_wmem_max);
735 set_sndbuf:
736 		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
737 		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
738 		/* Wake up sending tasks if we upped the value. */
739 		sk->sk_write_space(sk);
740 		break;
741 
742 	case SO_SNDBUFFORCE:
743 		if (!capable(CAP_NET_ADMIN)) {
744 			ret = -EPERM;
745 			break;
746 		}
747 		goto set_sndbuf;
748 
749 	case SO_RCVBUF:
750 		/* Don't error on this; BSD doesn't, and if you think
751 		 * about it this is right. Otherwise apps have to
752 		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
753 		 * are treated in BSD as hints.
754 		 */
755 		val = min_t(u32, val, sysctl_rmem_max);
756 set_rcvbuf:
757 		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
758 		/*
759 		 * We double it on the way in to account for
760 		 * "struct sk_buff" etc. overhead.   Applications
761 		 * assume that the SO_RCVBUF setting they make will
762 		 * allow that much actual data to be received on that
763 		 * socket.
764 		 *
765 		 * Applications are unaware that "struct sk_buff" and
766 		 * other overheads allocate from the receive buffer
767 		 * during socket buffer allocation.
768 		 *
769 		 * And after considering the possible alternatives,
770 		 * returning the value we actually used in getsockopt
771 		 * is the most desirable behavior.
772 		 */
773 		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
774 		break;
775 
776 	case SO_RCVBUFFORCE:
777 		if (!capable(CAP_NET_ADMIN)) {
778 			ret = -EPERM;
779 			break;
780 		}
781 		goto set_rcvbuf;
782 
783 	case SO_KEEPALIVE:
784 #ifdef CONFIG_INET
785 		if (sk->sk_protocol == IPPROTO_TCP &&
786 		    sk->sk_type == SOCK_STREAM)
787 			tcp_set_keepalive(sk, valbool);
788 #endif
789 		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
790 		break;
791 
792 	case SO_OOBINLINE:
793 		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
794 		break;
795 
796 	case SO_NO_CHECK:
797 		sk->sk_no_check_tx = valbool;
798 		break;
799 
800 	case SO_PRIORITY:
801 		if ((val >= 0 && val <= 6) ||
802 		    ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
803 			sk->sk_priority = val;
804 		else
805 			ret = -EPERM;
806 		break;
807 
808 	case SO_LINGER:
809 		if (optlen < sizeof(ling)) {
810 			ret = -EINVAL;	/* 1003.1g */
811 			break;
812 		}
813 		if (copy_from_user(&ling, optval, sizeof(ling))) {
814 			ret = -EFAULT;
815 			break;
816 		}
817 		if (!ling.l_onoff)
818 			sock_reset_flag(sk, SOCK_LINGER);
819 		else {
820 #if (BITS_PER_LONG == 32)
821 			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
822 				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
823 			else
824 #endif
825 				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
826 			sock_set_flag(sk, SOCK_LINGER);
827 		}
828 		break;
829 
830 	case SO_BSDCOMPAT:
831 		sock_warn_obsolete_bsdism("setsockopt");
832 		break;
833 
834 	case SO_PASSCRED:
835 		if (valbool)
836 			set_bit(SOCK_PASSCRED, &sock->flags);
837 		else
838 			clear_bit(SOCK_PASSCRED, &sock->flags);
839 		break;
840 
841 	case SO_TIMESTAMP:
842 	case SO_TIMESTAMPNS:
843 		if (valbool)  {
844 			if (optname == SO_TIMESTAMP)
845 				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
846 			else
847 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
848 			sock_set_flag(sk, SOCK_RCVTSTAMP);
849 			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
850 		} else {
851 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
852 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
853 		}
854 		break;
855 
856 	case SO_TIMESTAMPING:
857 		if (val & ~SOF_TIMESTAMPING_MASK) {
858 			ret = -EINVAL;
859 			break;
860 		}
861 
862 		if (val & SOF_TIMESTAMPING_OPT_ID &&
863 		    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
864 			if (sk->sk_protocol == IPPROTO_TCP) {
865 				if (sk->sk_state != TCP_ESTABLISHED) {
866 					ret = -EINVAL;
867 					break;
868 				}
869 				sk->sk_tskey = tcp_sk(sk)->snd_una;
870 			} else {
871 				sk->sk_tskey = 0;
872 			}
873 		}
874 		sk->sk_tsflags = val;
875 		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
876 			sock_enable_timestamp(sk,
877 					      SOCK_TIMESTAMPING_RX_SOFTWARE);
878 		else
879 			sock_disable_timestamp(sk,
880 					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
881 		break;
882 
883 	case SO_RCVLOWAT:
884 		if (val < 0)
885 			val = INT_MAX;
886 		sk->sk_rcvlowat = val ? : 1;
887 		break;
888 
889 	case SO_RCVTIMEO:
890 		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
891 		break;
892 
893 	case SO_SNDTIMEO:
894 		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
895 		break;
896 
897 	case SO_ATTACH_FILTER:
898 		ret = -EINVAL;
899 		if (optlen == sizeof(struct sock_fprog)) {
900 			struct sock_fprog fprog;
901 
902 			ret = -EFAULT;
903 			if (copy_from_user(&fprog, optval, sizeof(fprog)))
904 				break;
905 
906 			ret = sk_attach_filter(&fprog, sk);
907 		}
908 		break;
909 
910 	case SO_ATTACH_BPF:
911 		ret = -EINVAL;
912 		if (optlen == sizeof(u32)) {
913 			u32 ufd;
914 
915 			ret = -EFAULT;
916 			if (copy_from_user(&ufd, optval, sizeof(ufd)))
917 				break;
918 
919 			ret = sk_attach_bpf(ufd, sk);
920 		}
921 		break;
922 
923 	case SO_DETACH_FILTER:
924 		ret = sk_detach_filter(sk);
925 		break;
926 
927 	case SO_LOCK_FILTER:
928 		if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
929 			ret = -EPERM;
930 		else
931 			sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
932 		break;
933 
934 	case SO_PASSSEC:
935 		if (valbool)
936 			set_bit(SOCK_PASSSEC, &sock->flags);
937 		else
938 			clear_bit(SOCK_PASSSEC, &sock->flags);
939 		break;
940 	case SO_MARK:
941 		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
942 			ret = -EPERM;
943 		else
944 			sk->sk_mark = val;
945 		break;
946 
947 	case SO_RXQ_OVFL:
948 		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
949 		break;
950 
951 	case SO_WIFI_STATUS:
952 		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
953 		break;
954 
955 	case SO_PEEK_OFF:
956 		if (sock->ops->set_peek_off)
957 			ret = sock->ops->set_peek_off(sk, val);
958 		else
959 			ret = -EOPNOTSUPP;
960 		break;
961 
962 	case SO_NOFCS:
963 		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
964 		break;
965 
966 	case SO_SELECT_ERR_QUEUE:
967 		sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
968 		break;
969 
970 #ifdef CONFIG_NET_RX_BUSY_POLL
971 	case SO_BUSY_POLL:
972 		/* allow unprivileged users to decrease the value */
973 		if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
974 			ret = -EPERM;
975 		else {
976 			if (val < 0)
977 				ret = -EINVAL;
978 			else
979 				sk->sk_ll_usec = val;
980 		}
981 		break;
982 #endif
983 
984 	case SO_MAX_PACING_RATE:
985 		sk->sk_max_pacing_rate = val;
986 		sk->sk_pacing_rate = min(sk->sk_pacing_rate,
987 					 sk->sk_max_pacing_rate);
988 		break;
989 
990 	default:
991 		ret = -ENOPROTOOPT;
992 		break;
993 	}
994 	release_sock(sk);
995 	return ret;
996 }
997 EXPORT_SYMBOL(sock_setsockopt);
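/* Userspace example (illustrative only): because SO_SNDBUF/SO_RCVBUF are
 * doubled on the way in (see the comments above) and clamped to
 * sysctl_rmem_max/wmem_max, getsockopt() reports the value the kernel
 * actually uses:
 *
 *	int req = 65536, eff;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *
 * eff is typically 131072 here, assuming rmem_max permits it.
 */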
998 
999 
1000 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1001 			  struct ucred *ucred)
1002 {
1003 	ucred->pid = pid_vnr(pid);
1004 	ucred->uid = ucred->gid = -1;
1005 	if (cred) {
1006 		struct user_namespace *current_ns = current_user_ns();
1007 
1008 		ucred->uid = from_kuid_munged(current_ns, cred->euid);
1009 		ucred->gid = from_kgid_munged(current_ns, cred->egid);
1010 	}
1011 }
1012 
1013 int sock_getsockopt(struct socket *sock, int level, int optname,
1014 		    char __user *optval, int __user *optlen)
1015 {
1016 	struct sock *sk = sock->sk;
1017 
1018 	union {
1019 		int val;
1020 		struct linger ling;
1021 		struct timeval tm;
1022 	} v;
1023 
1024 	int lv = sizeof(int);
1025 	int len;
1026 
1027 	if (get_user(len, optlen))
1028 		return -EFAULT;
1029 	if (len < 0)
1030 		return -EINVAL;
1031 
1032 	memset(&v, 0, sizeof(v));
1033 
1034 	switch (optname) {
1035 	case SO_DEBUG:
1036 		v.val = sock_flag(sk, SOCK_DBG);
1037 		break;
1038 
1039 	case SO_DONTROUTE:
1040 		v.val = sock_flag(sk, SOCK_LOCALROUTE);
1041 		break;
1042 
1043 	case SO_BROADCAST:
1044 		v.val = sock_flag(sk, SOCK_BROADCAST);
1045 		break;
1046 
1047 	case SO_SNDBUF:
1048 		v.val = sk->sk_sndbuf;
1049 		break;
1050 
1051 	case SO_RCVBUF:
1052 		v.val = sk->sk_rcvbuf;
1053 		break;
1054 
1055 	case SO_REUSEADDR:
1056 		v.val = sk->sk_reuse;
1057 		break;
1058 
1059 	case SO_REUSEPORT:
1060 		v.val = sk->sk_reuseport;
1061 		break;
1062 
1063 	case SO_KEEPALIVE:
1064 		v.val = sock_flag(sk, SOCK_KEEPOPEN);
1065 		break;
1066 
1067 	case SO_TYPE:
1068 		v.val = sk->sk_type;
1069 		break;
1070 
1071 	case SO_PROTOCOL:
1072 		v.val = sk->sk_protocol;
1073 		break;
1074 
1075 	case SO_DOMAIN:
1076 		v.val = sk->sk_family;
1077 		break;
1078 
1079 	case SO_ERROR:
1080 		v.val = -sock_error(sk);
1081 		if (v.val == 0)
1082 			v.val = xchg(&sk->sk_err_soft, 0);
1083 		break;
1084 
1085 	case SO_OOBINLINE:
1086 		v.val = sock_flag(sk, SOCK_URGINLINE);
1087 		break;
1088 
1089 	case SO_NO_CHECK:
1090 		v.val = sk->sk_no_check_tx;
1091 		break;
1092 
1093 	case SO_PRIORITY:
1094 		v.val = sk->sk_priority;
1095 		break;
1096 
1097 	case SO_LINGER:
1098 		lv		= sizeof(v.ling);
1099 		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
1100 		v.ling.l_linger	= sk->sk_lingertime / HZ;
1101 		break;
1102 
1103 	case SO_BSDCOMPAT:
1104 		sock_warn_obsolete_bsdism("getsockopt");
1105 		break;
1106 
1107 	case SO_TIMESTAMP:
1108 		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1109 				!sock_flag(sk, SOCK_RCVTSTAMPNS);
1110 		break;
1111 
1112 	case SO_TIMESTAMPNS:
1113 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1114 		break;
1115 
1116 	case SO_TIMESTAMPING:
1117 		v.val = sk->sk_tsflags;
1118 		break;
1119 
1120 	case SO_RCVTIMEO:
1121 		lv = sizeof(struct timeval);
1122 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1123 			v.tm.tv_sec = 0;
1124 			v.tm.tv_usec = 0;
1125 		} else {
1126 			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1127 			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
1128 		}
1129 		break;
1130 
1131 	case SO_SNDTIMEO:
1132 		lv = sizeof(struct timeval);
1133 		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1134 			v.tm.tv_sec = 0;
1135 			v.tm.tv_usec = 0;
1136 		} else {
1137 			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1138 			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
1139 		}
1140 		break;
1141 
1142 	case SO_RCVLOWAT:
1143 		v.val = sk->sk_rcvlowat;
1144 		break;
1145 
1146 	case SO_SNDLOWAT:
1147 		v.val = 1;
1148 		break;
1149 
1150 	case SO_PASSCRED:
1151 		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1152 		break;
1153 
1154 	case SO_PEERCRED:
1155 	{
1156 		struct ucred peercred;
1157 		if (len > sizeof(peercred))
1158 			len = sizeof(peercred);
1159 		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1160 		if (copy_to_user(optval, &peercred, len))
1161 			return -EFAULT;
1162 		goto lenout;
1163 	}
1164 
1165 	case SO_PEERNAME:
1166 	{
1167 		char address[128];
1168 
1169 		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
1170 			return -ENOTCONN;
1171 		if (lv < len)
1172 			return -EINVAL;
1173 		if (copy_to_user(optval, address, len))
1174 			return -EFAULT;
1175 		goto lenout;
1176 	}
1177 
1178 	/* Dubious BSD thing... Probably nobody even uses it, but
1179 	 * the UNIX standard wants it for whatever reason... -DaveM
1180 	 */
1181 	case SO_ACCEPTCONN:
1182 		v.val = sk->sk_state == TCP_LISTEN;
1183 		break;
1184 
1185 	case SO_PASSSEC:
1186 		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1187 		break;
1188 
1189 	case SO_PEERSEC:
1190 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
1191 
1192 	case SO_MARK:
1193 		v.val = sk->sk_mark;
1194 		break;
1195 
1196 	case SO_RXQ_OVFL:
1197 		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1198 		break;
1199 
1200 	case SO_WIFI_STATUS:
1201 		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1202 		break;
1203 
1204 	case SO_PEEK_OFF:
1205 		if (!sock->ops->set_peek_off)
1206 			return -EOPNOTSUPP;
1207 
1208 		v.val = sk->sk_peek_off;
1209 		break;
1210 	case SO_NOFCS:
1211 		v.val = sock_flag(sk, SOCK_NOFCS);
1212 		break;
1213 
1214 	case SO_BINDTODEVICE:
1215 		return sock_getbindtodevice(sk, optval, optlen, len);
1216 
1217 	case SO_GET_FILTER:
1218 		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1219 		if (len < 0)
1220 			return len;
1221 
1222 		goto lenout;
1223 
1224 	case SO_LOCK_FILTER:
1225 		v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1226 		break;
1227 
1228 	case SO_BPF_EXTENSIONS:
1229 		v.val = bpf_tell_extensions();
1230 		break;
1231 
1232 	case SO_SELECT_ERR_QUEUE:
1233 		v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1234 		break;
1235 
1236 #ifdef CONFIG_NET_RX_BUSY_POLL
1237 	case SO_BUSY_POLL:
1238 		v.val = sk->sk_ll_usec;
1239 		break;
1240 #endif
1241 
1242 	case SO_MAX_PACING_RATE:
1243 		v.val = sk->sk_max_pacing_rate;
1244 		break;
1245 
1246 	case SO_INCOMING_CPU:
1247 		v.val = sk->sk_incoming_cpu;
1248 		break;
1249 
1250 	default:
1251 		/* We implement the SO_SNDLOWAT etc to not be settable
1252 		 * (1003.1g 7).
1253 		 */
1254 		return -ENOPROTOOPT;
1255 	}
1256 
1257 	if (len > lv)
1258 		len = lv;
1259 	if (copy_to_user(optval, &v, len))
1260 		return -EFAULT;
1261 lenout:
1262 	if (put_user(len, optlen))
1263 		return -EFAULT;
1264 	return 0;
1265 }
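/* Userspace example (illustrative only): reading the peer credentials
 * filled in by cred_to_ucred() above, e.g. on a connected AF_UNIX socket
 * (struct ucred needs _GNU_SOURCE with glibc):
 *
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &peer, &len) == 0)
 *		printf("pid=%d uid=%u gid=%u\n", peer.pid, peer.uid, peer.gid);
 */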
1266 
1267 /*
1268  * Initialize an sk_lock.
1269  *
1270  * (We also register the sk_lock with the lock validator.)
1271  */
1272 static inline void sock_lock_init(struct sock *sk)
1273 {
1274 	sock_lock_init_class_and_name(sk,
1275 			af_family_slock_key_strings[sk->sk_family],
1276 			af_family_slock_keys + sk->sk_family,
1277 			af_family_key_strings[sk->sk_family],
1278 			af_family_keys + sk->sk_family);
1279 }
1280 
1281 /*
1282  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1283  * even temporarily, because of RCU lookups. sk_node should also be left as is.
1284  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1285  */
1286 static void sock_copy(struct sock *nsk, const struct sock *osk)
1287 {
1288 #ifdef CONFIG_SECURITY_NETWORK
1289 	void *sptr = nsk->sk_security;
1290 #endif
1291 	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1292 
1293 	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1294 	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1295 
1296 #ifdef CONFIG_SECURITY_NETWORK
1297 	nsk->sk_security = sptr;
1298 	security_sk_clone(osk, nsk);
1299 #endif
1300 }
1301 
1302 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
1303 {
1304 	unsigned long nulls1, nulls2;
1305 
1306 	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
1307 	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
1308 	if (nulls1 > nulls2)
1309 		swap(nulls1, nulls2);
1310 
1311 	if (nulls1 != 0)
1312 		memset((char *)sk, 0, nulls1);
1313 	memset((char *)sk + nulls1 + sizeof(void *), 0,
1314 	       nulls2 - nulls1 - sizeof(void *));
1315 	memset((char *)sk + nulls2 + sizeof(void *), 0,
1316 	       size - nulls2 - sizeof(void *));
1317 }
1318 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);
1319 
1320 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1321 		int family)
1322 {
1323 	struct sock *sk;
1324 	struct kmem_cache *slab;
1325 
1326 	slab = prot->slab;
1327 	if (slab != NULL) {
1328 		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1329 		if (!sk)
1330 			return sk;
1331 		if (priority & __GFP_ZERO) {
1332 			if (prot->clear_sk)
1333 				prot->clear_sk(sk, prot->obj_size);
1334 			else
1335 				sk_prot_clear_nulls(sk, prot->obj_size);
1336 		}
1337 	} else
1338 		sk = kmalloc(prot->obj_size, priority);
1339 
1340 	if (sk != NULL) {
1341 		kmemcheck_annotate_bitfield(sk, flags);
1342 
1343 		if (security_sk_alloc(sk, family, priority))
1344 			goto out_free;
1345 
1346 		if (!try_module_get(prot->owner))
1347 			goto out_free_sec;
1348 		sk_tx_queue_clear(sk);
1349 	}
1350 
1351 	return sk;
1352 
1353 out_free_sec:
1354 	security_sk_free(sk);
1355 out_free:
1356 	if (slab != NULL)
1357 		kmem_cache_free(slab, sk);
1358 	else
1359 		kfree(sk);
1360 	return NULL;
1361 }
1362 
1363 static void sk_prot_free(struct proto *prot, struct sock *sk)
1364 {
1365 	struct kmem_cache *slab;
1366 	struct module *owner;
1367 
1368 	owner = prot->owner;
1369 	slab = prot->slab;
1370 
1371 	security_sk_free(sk);
1372 	if (slab != NULL)
1373 		kmem_cache_free(slab, sk);
1374 	else
1375 		kfree(sk);
1376 	module_put(owner);
1377 }
1378 
1379 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
1380 void sock_update_netprioidx(struct sock *sk)
1381 {
1382 	if (in_interrupt())
1383 		return;
1384 
1385 	sk->sk_cgrp_prioidx = task_netprioidx(current);
1386 }
1387 EXPORT_SYMBOL_GPL(sock_update_netprioidx);
1388 #endif
1389 
1390 /**
1391  *	sk_alloc - All socket objects are allocated here
1392  *	@net: the applicable net namespace
1393  *	@family: protocol family
1394  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1395  *	@prot: struct proto associated with this new sock instance
1396  */
1397 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1398 		      struct proto *prot)
1399 {
1400 	struct sock *sk;
1401 
1402 	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1403 	if (sk) {
1404 		sk->sk_family = family;
1405 		/*
1406 		 * See comment in struct sock definition to understand
1407 		 * why we need sk_prot_creator -acme
1408 		 */
1409 		sk->sk_prot = sk->sk_prot_creator = prot;
1410 		sock_lock_init(sk);
1411 		sock_net_set(sk, get_net(net));
1412 		atomic_set(&sk->sk_wmem_alloc, 1);
1413 
1414 		sock_update_classid(sk);
1415 		sock_update_netprioidx(sk);
1416 	}
1417 
1418 	return sk;
1419 }
1420 EXPORT_SYMBOL(sk_alloc);
1421 
1422 static void __sk_free(struct sock *sk)
1423 {
1424 	struct sk_filter *filter;
1425 
1426 	if (sk->sk_destruct)
1427 		sk->sk_destruct(sk);
1428 
1429 	filter = rcu_dereference_check(sk->sk_filter,
1430 				       atomic_read(&sk->sk_wmem_alloc) == 0);
1431 	if (filter) {
1432 		sk_filter_uncharge(sk, filter);
1433 		RCU_INIT_POINTER(sk->sk_filter, NULL);
1434 	}
1435 
1436 	sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1437 
1438 	if (atomic_read(&sk->sk_omem_alloc))
1439 		pr_debug("%s: optmem leakage (%d bytes) detected\n",
1440 			 __func__, atomic_read(&sk->sk_omem_alloc));
1441 
1442 	if (sk->sk_peer_cred)
1443 		put_cred(sk->sk_peer_cred);
1444 	put_pid(sk->sk_peer_pid);
1445 	put_net(sock_net(sk));
1446 	sk_prot_free(sk->sk_prot_creator, sk);
1447 }
1448 
1449 void sk_free(struct sock *sk)
1450 {
1451 	/*
1452 	 * We subtract one from sk_wmem_alloc and can tell whether
1453 	 * some packets are still in some tx queue.
1454 	 * If it is not zero, sock_wfree() will call __sk_free(sk) later.
1455 	 */
1456 	if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1457 		__sk_free(sk);
1458 }
1459 EXPORT_SYMBOL(sk_free);
1460 
1461 /*
1462  * The last sock_put should drop the reference to sk->sk_net. It has already
1463  * been dropped in sk_change_net. Taking a reference to the stopping namespace
1464  * is not an option.
1465  * Take a reference to the socket to remove it from the hash while it is still
1466  * _alive_, and after that destroy it in the context of init_net.
1467  */
1468 void sk_release_kernel(struct sock *sk)
1469 {
1470 	if (sk == NULL || sk->sk_socket == NULL)
1471 		return;
1472 
1473 	sock_hold(sk);
1474 	sock_release(sk->sk_socket);
1475 	sock_net_set(sk, get_net(&init_net));
1476 	sock_put(sk);
1477 }
1478 EXPORT_SYMBOL(sk_release_kernel);
1479 
1480 static void sk_update_clone(const struct sock *sk, struct sock *newsk)
1481 {
1482 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
1483 		sock_update_memcg(newsk);
1484 }
1485 
1486 /**
1487  *	sk_clone_lock - clone a socket, and lock its clone
1488  *	@sk: the socket to clone
1489  *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1490  *
1491  *	Caller must unlock socket even in error path (bh_unlock_sock(newsk))
1492  */
1493 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1494 {
1495 	struct sock *newsk;
1496 	bool is_charged = true;
1497 
1498 	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1499 	if (newsk != NULL) {
1500 		struct sk_filter *filter;
1501 
1502 		sock_copy(newsk, sk);
1503 
1504 		/* SANITY */
1505 		get_net(sock_net(newsk));
1506 		sk_node_init(&newsk->sk_node);
1507 		sock_lock_init(newsk);
1508 		bh_lock_sock(newsk);
1509 		newsk->sk_backlog.head	= newsk->sk_backlog.tail = NULL;
1510 		newsk->sk_backlog.len = 0;
1511 
1512 		atomic_set(&newsk->sk_rmem_alloc, 0);
1513 		/*
1514 		 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1515 		 */
1516 		atomic_set(&newsk->sk_wmem_alloc, 1);
1517 		atomic_set(&newsk->sk_omem_alloc, 0);
1518 		skb_queue_head_init(&newsk->sk_receive_queue);
1519 		skb_queue_head_init(&newsk->sk_write_queue);
1520 
1521 		spin_lock_init(&newsk->sk_dst_lock);
1522 		rwlock_init(&newsk->sk_callback_lock);
1523 		lockdep_set_class_and_name(&newsk->sk_callback_lock,
1524 				af_callback_keys + newsk->sk_family,
1525 				af_family_clock_key_strings[newsk->sk_family]);
1526 
1527 		newsk->sk_dst_cache	= NULL;
1528 		newsk->sk_wmem_queued	= 0;
1529 		newsk->sk_forward_alloc = 0;
1530 		newsk->sk_send_head	= NULL;
1531 		newsk->sk_userlocks	= sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1532 
1533 		sock_reset_flag(newsk, SOCK_DONE);
1534 		skb_queue_head_init(&newsk->sk_error_queue);
1535 
1536 		filter = rcu_dereference_protected(newsk->sk_filter, 1);
1537 		if (filter != NULL)
1538 			/* though it's an empty new sock, the charging may fail
1539 			 * if sysctl_optmem_max was changed between the creation of
1540 			 * the original socket and this cloning
1541 			 */
1542 			is_charged = sk_filter_charge(newsk, filter);
1543 
1544 		if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
1545 			/* It is still a raw copy of the parent, so invalidate
1546 			 * its destructor and do a plain sk_free() */
1547 			newsk->sk_destruct = NULL;
1548 			bh_unlock_sock(newsk);
1549 			sk_free(newsk);
1550 			newsk = NULL;
1551 			goto out;
1552 		}
1553 
1554 		newsk->sk_err	   = 0;
1555 		newsk->sk_priority = 0;
1556 		newsk->sk_incoming_cpu = raw_smp_processor_id();
1557 		atomic64_set(&newsk->sk_cookie, 0);
1558 		/*
1559 		 * Before updating sk_refcnt, we must commit prior changes to memory
1560 		 * (Documentation/RCU/rculist_nulls.txt for details)
1561 		 */
1562 		smp_wmb();
1563 		atomic_set(&newsk->sk_refcnt, 2);
1564 
1565 		/*
1566 		 * Increment the counter in the same struct proto as the master
1567 		 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1568 		 * is the same as sk->sk_prot->socks, as this field was copied
1569 		 * with memcpy).
1570 		 *
1571 		 * This _changes_ the previous behaviour, where
1572 		 * tcp_create_openreq_child always was incrementing the
1573 		 * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1574 		 * to be taken into account in all callers. -acme
1575 		 */
1576 		sk_refcnt_debug_inc(newsk);
1577 		sk_set_socket(newsk, NULL);
1578 		newsk->sk_wq = NULL;
1579 
1580 		sk_update_clone(sk, newsk);
1581 
1582 		if (newsk->sk_prot->sockets_allocated)
1583 			sk_sockets_allocated_inc(newsk);
1584 
1585 		if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1586 			net_enable_timestamp();
1587 	}
1588 out:
1589 	return newsk;
1590 }
1591 EXPORT_SYMBOL_GPL(sk_clone_lock);
1592 
1593 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1594 {
1595 	__sk_dst_set(sk, dst);
1596 	sk->sk_route_caps = dst->dev->features;
1597 	if (sk->sk_route_caps & NETIF_F_GSO)
1598 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1599 	sk->sk_route_caps &= ~sk->sk_route_nocaps;
1600 	if (sk_can_gso(sk)) {
1601 		if (dst->header_len) {
1602 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1603 		} else {
1604 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1605 			sk->sk_gso_max_size = dst->dev->gso_max_size;
1606 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
1607 		}
1608 	}
1609 }
1610 EXPORT_SYMBOL_GPL(sk_setup_caps);
1611 
1612 /*
1613  *	Simple resource managers for sockets.
1614  */
1615 
1616 
1617 /*
1618  * Write buffer destructor automatically called from kfree_skb.
1619  */
1620 void sock_wfree(struct sk_buff *skb)
1621 {
1622 	struct sock *sk = skb->sk;
1623 	unsigned int len = skb->truesize;
1624 
1625 	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1626 		/*
1627 		 * Keep a reference on sk_wmem_alloc, this will be released
1628 		 * after sk_write_space() call
1629 		 */
1630 		atomic_sub(len - 1, &sk->sk_wmem_alloc);
1631 		sk->sk_write_space(sk);
1632 		len = 1;
1633 	}
1634 	/*
1635 	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1636 	 * could not do because of in-flight packets
1637 	 */
1638 	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1639 		__sk_free(sk);
1640 }
1641 EXPORT_SYMBOL(sock_wfree);
1642 
1643 void skb_orphan_partial(struct sk_buff *skb)
1644 {
1645 	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
1646 	 * so we do not completely orphan the skb, but transfer all
1647 	 * accounted bytes but one, to avoid unexpected reorders.
1648 	 */
1649 	if (skb->destructor == sock_wfree
1650 #ifdef CONFIG_INET
1651 	    || skb->destructor == tcp_wfree
1652 #endif
1653 		) {
1654 		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
1655 		skb->truesize = 1;
1656 	} else {
1657 		skb_orphan(skb);
1658 	}
1659 }
1660 EXPORT_SYMBOL(skb_orphan_partial);
1661 
1662 /*
1663  * Read buffer destructor automatically called from kfree_skb.
1664  */
1665 void sock_rfree(struct sk_buff *skb)
1666 {
1667 	struct sock *sk = skb->sk;
1668 	unsigned int len = skb->truesize;
1669 
1670 	atomic_sub(len, &sk->sk_rmem_alloc);
1671 	sk_mem_uncharge(sk, len);
1672 }
1673 EXPORT_SYMBOL(sock_rfree);
1674 
1675 /*
1676  * Buffer destructor for skbs that are not used directly in read or write
1677  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1678  */
1679 void sock_efree(struct sk_buff *skb)
1680 {
1681 	sock_put(skb->sk);
1682 }
1683 EXPORT_SYMBOL(sock_efree);
1684 
1685 kuid_t sock_i_uid(struct sock *sk)
1686 {
1687 	kuid_t uid;
1688 
1689 	read_lock_bh(&sk->sk_callback_lock);
1690 	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1691 	read_unlock_bh(&sk->sk_callback_lock);
1692 	return uid;
1693 }
1694 EXPORT_SYMBOL(sock_i_uid);
1695 
1696 unsigned long sock_i_ino(struct sock *sk)
1697 {
1698 	unsigned long ino;
1699 
1700 	read_lock_bh(&sk->sk_callback_lock);
1701 	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1702 	read_unlock_bh(&sk->sk_callback_lock);
1703 	return ino;
1704 }
1705 EXPORT_SYMBOL(sock_i_ino);
1706 
1707 /*
1708  * Allocate a skb from the socket's send buffer.
1709  */
1710 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1711 			     gfp_t priority)
1712 {
1713 	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1714 		struct sk_buff *skb = alloc_skb(size, priority);
1715 		if (skb) {
1716 			skb_set_owner_w(skb, sk);
1717 			return skb;
1718 		}
1719 	}
1720 	return NULL;
1721 }
1722 EXPORT_SYMBOL(sock_wmalloc);
1723 
1724 /*
1725  * Allocate a memory block from the socket's option memory buffer.
1726  */
1727 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1728 {
1729 	if ((unsigned int)size <= sysctl_optmem_max &&
1730 	    atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1731 		void *mem;
1732 		/* First do the add, to avoid the race if kmalloc
1733 		 * might sleep.
1734 		 */
1735 		atomic_add(size, &sk->sk_omem_alloc);
1736 		mem = kmalloc(size, priority);
1737 		if (mem)
1738 			return mem;
1739 		atomic_sub(size, &sk->sk_omem_alloc);
1740 	}
1741 	return NULL;
1742 }
1743 EXPORT_SYMBOL(sock_kmalloc);
1744 
1745 /* Free an option memory block. Note, we actually want the inline
1746  * here as this allows gcc to detect the nullify and fold away the
1747  * condition entirely.
1748  */
1749 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
1750 				  const bool nullify)
1751 {
1752 	if (WARN_ON_ONCE(!mem))
1753 		return;
1754 	if (nullify)
1755 		kzfree(mem);
1756 	else
1757 		kfree(mem);
1758 	atomic_sub(size, &sk->sk_omem_alloc);
1759 }
1760 
1761 void sock_kfree_s(struct sock *sk, void *mem, int size)
1762 {
1763 	__sock_kfree_s(sk, mem, size, false);
1764 }
1765 EXPORT_SYMBOL(sock_kfree_s);
1766 
1767 void sock_kzfree_s(struct sock *sk, void *mem, int size)
1768 {
1769 	__sock_kfree_s(sk, mem, size, true);
1770 }
1771 EXPORT_SYMBOL(sock_kzfree_s);
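/* A minimal sketch (hypothetical option handler): memory obtained with
 * sock_kmalloc() is charged to sk_omem_alloc and must be released with
 * sock_kfree_s() (or sock_kzfree_s() for sensitive data) using the same
 * size:
 *
 *	opt = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, optlen);
 */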
1772 
1773 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1774    I think these locks should be removed for datagram sockets.
1775  */
1776 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1777 {
1778 	DEFINE_WAIT(wait);
1779 
1780 	clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1781 	for (;;) {
1782 		if (!timeo)
1783 			break;
1784 		if (signal_pending(current))
1785 			break;
1786 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1787 		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1788 		if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1789 			break;
1790 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1791 			break;
1792 		if (sk->sk_err)
1793 			break;
1794 		timeo = schedule_timeout(timeo);
1795 	}
1796 	finish_wait(sk_sleep(sk), &wait);
1797 	return timeo;
1798 }
1799 
1800 
1801 /*
1802  *	Generic send/receive buffer handlers
1803  */
1804 
1805 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1806 				     unsigned long data_len, int noblock,
1807 				     int *errcode, int max_page_order)
1808 {
1809 	struct sk_buff *skb;
1810 	long timeo;
1811 	int err;
1812 
1813 	timeo = sock_sndtimeo(sk, noblock);
1814 	for (;;) {
1815 		err = sock_error(sk);
1816 		if (err != 0)
1817 			goto failure;
1818 
1819 		err = -EPIPE;
1820 		if (sk->sk_shutdown & SEND_SHUTDOWN)
1821 			goto failure;
1822 
1823 		if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
1824 			break;
1825 
1826 		set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1827 		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1828 		err = -EAGAIN;
1829 		if (!timeo)
1830 			goto failure;
1831 		if (signal_pending(current))
1832 			goto interrupted;
1833 		timeo = sock_wait_for_wmem(sk, timeo);
1834 	}
1835 	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
1836 				   errcode, sk->sk_allocation);
1837 	if (skb)
1838 		skb_set_owner_w(skb, sk);
1839 	return skb;
1840 
1841 interrupted:
1842 	err = sock_intr_errno(timeo);
1843 failure:
1844 	*errcode = err;
1845 	return NULL;
1846 }
1847 EXPORT_SYMBOL(sock_alloc_send_pskb);
1848 
1849 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1850 				    int noblock, int *errcode)
1851 {
1852 	return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
1853 }
1854 EXPORT_SYMBOL(sock_alloc_send_skb);
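/* A minimal sketch (simplified datagram sendmsg path): the helpers above
 * wait for send-buffer space, honouring the send timeout, and charge the
 * new skb to the socket via skb_set_owner_w(); hlen and dlen stand in for
 * the protocol's header and payload sizes:
 *
 *	skb = sock_alloc_send_skb(sk, hlen + dlen,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;
 */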
1855 
1856 /* On 32bit arches, an skb frag is limited to 2^15 */
1857 #define SKB_FRAG_PAGE_ORDER	get_order(32768)
1858 
1859 /**
1860  * skb_page_frag_refill - check that a page_frag contains enough room
1861  * @sz: minimum size of the fragment we want to get
1862  * @pfrag: pointer to page_frag
1863  * @gfp: priority for memory allocation
1864  *
1865  * Note: While this allocator tries to use high order pages, there is
1866  * no guarantee that allocations succeed. Therefore, @sz MUST be
1867  * less than or equal to PAGE_SIZE.
1868  */
1869 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
1870 {
1871 	if (pfrag->page) {
1872 		if (atomic_read(&pfrag->page->_count) == 1) {
1873 			pfrag->offset = 0;
1874 			return true;
1875 		}
1876 		if (pfrag->offset + sz <= pfrag->size)
1877 			return true;
1878 		put_page(pfrag->page);
1879 	}
1880 
1881 	pfrag->offset = 0;
1882 	if (SKB_FRAG_PAGE_ORDER) {
1883 		pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
1884 					  __GFP_NOWARN | __GFP_NORETRY,
1885 					  SKB_FRAG_PAGE_ORDER);
1886 		if (likely(pfrag->page)) {
1887 			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
1888 			return true;
1889 		}
1890 	}
1891 	pfrag->page = alloc_page(gfp);
1892 	if (likely(pfrag->page)) {
1893 		pfrag->size = PAGE_SIZE;
1894 		return true;
1895 	}
1896 	return false;
1897 }
1898 EXPORT_SYMBOL(skb_page_frag_refill);
1899 
1900 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
1901 {
1902 	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
1903 		return true;
1904 
1905 	sk_enter_memory_pressure(sk);
1906 	sk_stream_moderate_sndbuf(sk);
1907 	return false;
1908 }
1909 EXPORT_SYMBOL(sk_page_frag_refill);
1910 
1911 static void __lock_sock(struct sock *sk)
1912 	__releases(&sk->sk_lock.slock)
1913 	__acquires(&sk->sk_lock.slock)
1914 {
1915 	DEFINE_WAIT(wait);
1916 
1917 	for (;;) {
1918 		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1919 					TASK_UNINTERRUPTIBLE);
1920 		spin_unlock_bh(&sk->sk_lock.slock);
1921 		schedule();
1922 		spin_lock_bh(&sk->sk_lock.slock);
1923 		if (!sock_owned_by_user(sk))
1924 			break;
1925 	}
1926 	finish_wait(&sk->sk_lock.wq, &wait);
1927 }
1928 
1929 static void __release_sock(struct sock *sk)
1930 	__releases(&sk->sk_lock.slock)
1931 	__acquires(&sk->sk_lock.slock)
1932 {
1933 	struct sk_buff *skb = sk->sk_backlog.head;
1934 
1935 	do {
1936 		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1937 		bh_unlock_sock(sk);
1938 
1939 		do {
1940 			struct sk_buff *next = skb->next;
1941 
1942 			prefetch(next);
1943 			WARN_ON_ONCE(skb_dst_is_noref(skb));
1944 			skb->next = NULL;
1945 			sk_backlog_rcv(sk, skb);
1946 
1947 			/*
1948 			 * We are in process context here with softirqs
1949 			 * disabled; use cond_resched_softirq() to preempt.
1950 			 * This is safe to do because we've taken the backlog
1951 			 * queue private.
1952 			 */
1953 			cond_resched_softirq();
1954 
1955 			skb = next;
1956 		} while (skb != NULL);
1957 
1958 		bh_lock_sock(sk);
1959 	} while ((skb = sk->sk_backlog.head) != NULL);
1960 
1961 	/*
1962 	 * Doing the zeroing here guarantees that we cannot loop forever
1963 	 * while a wild producer attempts to flood us.
1964 	 */
1965 	sk->sk_backlog.len = 0;
1966 }
1967 
1968 /**
1969  * sk_wait_data - wait for data to arrive at sk_receive_queue
1970  * @sk:    sock to wait on
1971  * @timeo: for how long
1972  *
1973  * Now the socket state, including sk->sk_err, is changed only under the
1974  * socket lock, hence we may omit checks after joining the wait queue.
1975  * We check the receive queue before schedule() only as an optimization;
1976  * it is very likely that release_sock() added new data.
1977  */
1978 int sk_wait_data(struct sock *sk, long *timeo)
1979 {
1980 	int rc;
1981 	DEFINE_WAIT(wait);
1982 
1983 	prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1984 	set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1985 	rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1986 	clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1987 	finish_wait(sk_sleep(sk), &wait);
1988 	return rc;
1989 }
1990 EXPORT_SYMBOL(sk_wait_data);
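
/*
 * Example (illustrative sketch only): a blocking recvmsg() path usually
 * loops on sk_wait_data() until data arrives, the timeout expires or a
 * signal is pending.  Error handling is trimmed for brevity.
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	struct sk_buff *skb;
 *
 *	lock_sock(sk);
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN))
 *			break;
 *		if (!timeo || signal_pending(current))
 *			break;
 *		// drops and re-acquires the socket lock while sleeping
 *		sk_wait_data(sk, &timeo);
 *	}
 *	// ... consume skb (if any) ...
 *	release_sock(sk);
 */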
1991 
1992 /**
1993  *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1994  *	@sk: socket
1995  *	@size: memory size to allocate
1996  *	@kind: allocation type
1997  *
1998  *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1999  *	rmem allocation. This function assumes that protocols which have
2000  *	memory_pressure use sk_wmem_queued as write buffer accounting.
2001  */
2002 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2003 {
2004 	struct proto *prot = sk->sk_prot;
2005 	int amt = sk_mem_pages(size);
2006 	long allocated;
2007 	int parent_status = UNDER_LIMIT;
2008 
2009 	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
2010 
2011 	allocated = sk_memory_allocated_add(sk, amt, &parent_status);
2012 
2013 	/* Under limit. */
2014 	if (parent_status == UNDER_LIMIT &&
2015 			allocated <= sk_prot_mem_limits(sk, 0)) {
2016 		sk_leave_memory_pressure(sk);
2017 		return 1;
2018 	}
2019 
2020 	/* Under pressure. (we or our parents) */
2021 	if ((parent_status > SOFT_LIMIT) ||
2022 			allocated > sk_prot_mem_limits(sk, 1))
2023 		sk_enter_memory_pressure(sk);
2024 
2025 	/* Over hard limit (we or our parents) */
2026 	if ((parent_status == OVER_LIMIT) ||
2027 			(allocated > sk_prot_mem_limits(sk, 2)))
2028 		goto suppress_allocation;
2029 
2030 	/* guarantee minimum buffer size under pressure */
2031 	if (kind == SK_MEM_RECV) {
2032 		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
2033 			return 1;
2034 
2035 	} else { /* SK_MEM_SEND */
2036 		if (sk->sk_type == SOCK_STREAM) {
2037 			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
2038 				return 1;
2039 		} else if (atomic_read(&sk->sk_wmem_alloc) <
2040 			   prot->sysctl_wmem[0])
2041 				return 1;
2042 	}
2043 
2044 	if (sk_has_memory_pressure(sk)) {
2045 		int alloc;
2046 
2047 		if (!sk_under_memory_pressure(sk))
2048 			return 1;
2049 		alloc = sk_sockets_allocated_read_positive(sk);
2050 		if (sk_prot_mem_limits(sk, 2) > alloc *
2051 		    sk_mem_pages(sk->sk_wmem_queued +
2052 				 atomic_read(&sk->sk_rmem_alloc) +
2053 				 sk->sk_forward_alloc))
2054 			return 1;
2055 	}
2056 
2057 suppress_allocation:
2058 
2059 	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2060 		sk_stream_moderate_sndbuf(sk);
2061 
2062 		/* Fail only if socket is _under_ its sndbuf.
2063 		 * In this case we cannot block, so we have to fail.
2064 		 */
2065 		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2066 			return 1;
2067 	}
2068 
2069 	trace_sock_exceed_buf_limit(sk, prot, allocated);
2070 
2071 	/* Alas. Undo changes. */
2072 	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
2073 
2074 	sk_memory_allocated_sub(sk, amt);
2075 
2076 	return 0;
2077 }
2078 EXPORT_SYMBOL(__sk_mem_schedule);
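
/*
 * Example (illustrative sketch only): protocols normally reach
 * __sk_mem_schedule() through the sk_rmem_schedule()/sk_wmem_schedule()
 * wrappers, e.g. when deciding whether an incoming skb may be queued:
 *
 *	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 *	    (unsigned int)sk->sk_rcvbuf ||
 *	    !sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;		// over rcvbuf or protocol limits
 *
 *	skb_set_owner_r(skb, sk);	// charges rmem and forward_alloc
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 */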
2079 
2080 /**
2081  *	__sk_mem_reclaim - reclaim memory_allocated
2082  *	@sk: socket
2083  */
2084 void __sk_mem_reclaim(struct sock *sk)
2085 {
2086 	sk_memory_allocated_sub(sk,
2087 				sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT);
2088 	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
2089 
2090 	if (sk_under_memory_pressure(sk) &&
2091 	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2092 		sk_leave_memory_pressure(sk);
2093 }
2094 EXPORT_SYMBOL(__sk_mem_reclaim);
2095 
2096 
2097 /*
2098  * Set of default routines for initialising struct proto_ops when
2099  * the protocol does not support a particular function. In certain
2100  * cases where it makes no sense for a protocol to have a "do nothing"
2101  * function, some default processing is provided.
2102  */
2103 
2104 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2105 {
2106 	return -EOPNOTSUPP;
2107 }
2108 EXPORT_SYMBOL(sock_no_bind);
2109 
2110 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2111 		    int len, int flags)
2112 {
2113 	return -EOPNOTSUPP;
2114 }
2115 EXPORT_SYMBOL(sock_no_connect);
2116 
2117 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
2118 {
2119 	return -EOPNOTSUPP;
2120 }
2121 EXPORT_SYMBOL(sock_no_socketpair);
2122 
2123 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
2124 {
2125 	return -EOPNOTSUPP;
2126 }
2127 EXPORT_SYMBOL(sock_no_accept);
2128 
2129 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
2130 		    int *len, int peer)
2131 {
2132 	return -EOPNOTSUPP;
2133 }
2134 EXPORT_SYMBOL(sock_no_getname);
2135 
2136 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2137 {
2138 	return 0;
2139 }
2140 EXPORT_SYMBOL(sock_no_poll);
2141 
2142 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2143 {
2144 	return -EOPNOTSUPP;
2145 }
2146 EXPORT_SYMBOL(sock_no_ioctl);
2147 
2148 int sock_no_listen(struct socket *sock, int backlog)
2149 {
2150 	return -EOPNOTSUPP;
2151 }
2152 EXPORT_SYMBOL(sock_no_listen);
2153 
2154 int sock_no_shutdown(struct socket *sock, int how)
2155 {
2156 	return -EOPNOTSUPP;
2157 }
2158 EXPORT_SYMBOL(sock_no_shutdown);
2159 
2160 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2161 		    char __user *optval, unsigned int optlen)
2162 {
2163 	return -EOPNOTSUPP;
2164 }
2165 EXPORT_SYMBOL(sock_no_setsockopt);
2166 
2167 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2168 		    char __user *optval, int __user *optlen)
2169 {
2170 	return -EOPNOTSUPP;
2171 }
2172 EXPORT_SYMBOL(sock_no_getsockopt);
2173 
2174 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
2175 {
2176 	return -EOPNOTSUPP;
2177 }
2178 EXPORT_SYMBOL(sock_no_sendmsg);
2179 
2180 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
2181 		    int flags)
2182 {
2183 	return -EOPNOTSUPP;
2184 }
2185 EXPORT_SYMBOL(sock_no_recvmsg);
2186 
2187 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
2188 {
2189 	/* Mirror missing mmap method error code */
2190 	return -ENODEV;
2191 }
2192 EXPORT_SYMBOL(sock_no_mmap);
2193 
2194 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2195 {
2196 	ssize_t res;
2197 	struct msghdr msg = {.msg_flags = flags};
2198 	struct kvec iov;
2199 	char *kaddr = kmap(page);
2200 	iov.iov_base = kaddr + offset;
2201 	iov.iov_len = size;
2202 	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2203 	kunmap(page);
2204 	return res;
2205 }
2206 EXPORT_SYMBOL(sock_no_sendpage);
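
/*
 * Example (illustrative sketch only): a minimal address family can plug the
 * sock_no_*() stubs straight into its proto_ops for everything it does not
 * implement.  PF_EXAMPLE, example_release, example_sendmsg and
 * example_recvmsg are hypothetical names.
 *
 *	static const struct proto_ops example_proto_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */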
2207 
2208 /*
2209  *	Default Socket Callbacks
2210  */
2211 
2212 static void sock_def_wakeup(struct sock *sk)
2213 {
2214 	struct socket_wq *wq;
2215 
2216 	rcu_read_lock();
2217 	wq = rcu_dereference(sk->sk_wq);
2218 	if (wq_has_sleeper(wq))
2219 		wake_up_interruptible_all(&wq->wait);
2220 	rcu_read_unlock();
2221 }
2222 
2223 static void sock_def_error_report(struct sock *sk)
2224 {
2225 	struct socket_wq *wq;
2226 
2227 	rcu_read_lock();
2228 	wq = rcu_dereference(sk->sk_wq);
2229 	if (wq_has_sleeper(wq))
2230 		wake_up_interruptible_poll(&wq->wait, POLLERR);
2231 	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2232 	rcu_read_unlock();
2233 }
2234 
2235 static void sock_def_readable(struct sock *sk)
2236 {
2237 	struct socket_wq *wq;
2238 
2239 	rcu_read_lock();
2240 	wq = rcu_dereference(sk->sk_wq);
2241 	if (wq_has_sleeper(wq))
2242 		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
2243 						POLLRDNORM | POLLRDBAND);
2244 	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2245 	rcu_read_unlock();
2246 }
2247 
2248 static void sock_def_write_space(struct sock *sk)
2249 {
2250 	struct socket_wq *wq;
2251 
2252 	rcu_read_lock();
2253 
2254 	/* Do not wake up a writer until he can make "significant"
2255 	 * progress.  --DaveM
2256 	 */
2257 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2258 		wq = rcu_dereference(sk->sk_wq);
2259 		if (wq_has_sleeper(wq))
2260 			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
2261 						POLLWRNORM | POLLWRBAND);
2262 
2263 		/* Should agree with poll, otherwise some programs break */
2264 		if (sock_writeable(sk))
2265 			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2266 	}
2267 
2268 	rcu_read_unlock();
2269 }
2270 
2271 static void sock_def_destruct(struct sock *sk)
2272 {
2273 	kfree(sk->sk_protinfo);
2274 }
2275 
2276 void sk_send_sigurg(struct sock *sk)
2277 {
2278 	if (sk->sk_socket && sk->sk_socket->file)
2279 		if (send_sigurg(&sk->sk_socket->file->f_owner))
2280 			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2281 }
2282 EXPORT_SYMBOL(sk_send_sigurg);
2283 
2284 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
2285 		    unsigned long expires)
2286 {
2287 	if (!mod_timer(timer, expires))
2288 		sock_hold(sk);
2289 }
2290 EXPORT_SYMBOL(sk_reset_timer);
2291 
2292 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
2293 {
2294 	if (del_timer(timer))
2295 		__sock_put(sk);
2296 }
2297 EXPORT_SYMBOL(sk_stop_timer);
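
/*
 * Example (illustrative sketch only): the pairing expected by the two
 * helpers above.  sk_reset_timer() takes a reference on the sock when it
 * arms a timer that was not already pending, and sk_stop_timer() drops it
 * again if the timer was still pending; a handler that does not re-arm is
 * expected to drop the reference itself (typically via sock_put()).
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + 10 * HZ);
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);
 */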
2298 
2299 void sock_init_data(struct socket *sock, struct sock *sk)
2300 {
2301 	skb_queue_head_init(&sk->sk_receive_queue);
2302 	skb_queue_head_init(&sk->sk_write_queue);
2303 	skb_queue_head_init(&sk->sk_error_queue);
2304 
2305 	sk->sk_send_head	=	NULL;
2306 
2307 	init_timer(&sk->sk_timer);
2308 
2309 	sk->sk_allocation	=	GFP_KERNEL;
2310 	sk->sk_rcvbuf		=	sysctl_rmem_default;
2311 	sk->sk_sndbuf		=	sysctl_wmem_default;
2312 	sk->sk_state		=	TCP_CLOSE;
2313 	sk_set_socket(sk, sock);
2314 
2315 	sock_set_flag(sk, SOCK_ZAPPED);
2316 
2317 	if (sock) {
2318 		sk->sk_type	=	sock->type;
2319 		sk->sk_wq	=	sock->wq;
2320 		sock->sk	=	sk;
2321 	} else
2322 		sk->sk_wq	=	NULL;
2323 
2324 	spin_lock_init(&sk->sk_dst_lock);
2325 	rwlock_init(&sk->sk_callback_lock);
2326 	lockdep_set_class_and_name(&sk->sk_callback_lock,
2327 			af_callback_keys + sk->sk_family,
2328 			af_family_clock_key_strings[sk->sk_family]);
2329 
2330 	sk->sk_state_change	=	sock_def_wakeup;
2331 	sk->sk_data_ready	=	sock_def_readable;
2332 	sk->sk_write_space	=	sock_def_write_space;
2333 	sk->sk_error_report	=	sock_def_error_report;
2334 	sk->sk_destruct		=	sock_def_destruct;
2335 
2336 	sk->sk_frag.page	=	NULL;
2337 	sk->sk_frag.offset	=	0;
2338 	sk->sk_peek_off		=	-1;
2339 
2340 	sk->sk_peer_pid 	=	NULL;
2341 	sk->sk_peer_cred	=	NULL;
2342 	sk->sk_write_pending	=	0;
2343 	sk->sk_rcvlowat		=	1;
2344 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
2345 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
2346 
2347 	sk->sk_stamp = ktime_set(-1L, 0);
2348 
2349 #ifdef CONFIG_NET_RX_BUSY_POLL
2350 	sk->sk_napi_id		=	0;
2351 	sk->sk_ll_usec		=	sysctl_net_busy_read;
2352 #endif
2353 
2354 	sk->sk_max_pacing_rate = ~0U;
2355 	sk->sk_pacing_rate = ~0U;
2356 	/*
2357 	 * Before updating sk_refcnt, we must commit prior changes to memory
2358 	 * (Documentation/RCU/rculist_nulls.txt for details)
2359 	 */
2360 	smp_wmb();
2361 	atomic_set(&sk->sk_refcnt, 1);
2362 	atomic_set(&sk->sk_drops, 0);
2363 }
2364 EXPORT_SYMBOL(sock_init_data);
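
/*
 * Example (illustrative sketch only): sock_init_data() is normally called
 * from an address family's create() hook, right after sk_alloc() and before
 * any protocol specific fields or callbacks are overridden.  PF_EXAMPLE,
 * example_proto, example_proto_ops and example_destruct are hypothetical.
 *
 *	static int example_create(struct net *net, struct socket *sock,
 *				  int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_EXAMPLE, GFP_KERNEL, &example_proto);
 *		if (!sk)
 *			return -ENOBUFS;
 *
 *		sock->ops = &example_proto_ops;
 *		sock_init_data(sock, sk);	// queues, timer, callbacks
 *
 *		sk->sk_protocol = protocol;
 *		sk->sk_destruct = example_destruct;	// override a default
 *		return 0;
 *	}
 */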
2365 
2366 void lock_sock_nested(struct sock *sk, int subclass)
2367 {
2368 	might_sleep();
2369 	spin_lock_bh(&sk->sk_lock.slock);
2370 	if (sk->sk_lock.owned)
2371 		__lock_sock(sk);
2372 	sk->sk_lock.owned = 1;
2373 	spin_unlock(&sk->sk_lock.slock);
2374 	/*
2375 	 * The sk_lock has mutex_lock() semantics here:
2376 	 */
2377 	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2378 	local_bh_enable();
2379 }
2380 EXPORT_SYMBOL(lock_sock_nested);
2381 
2382 void release_sock(struct sock *sk)
2383 {
2384 	/*
2385 	 * The sk_lock has mutex_unlock() semantics:
2386 	 */
2387 	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
2388 
2389 	spin_lock_bh(&sk->sk_lock.slock);
2390 	if (sk->sk_backlog.tail)
2391 		__release_sock(sk);
2392 
2393 	/* Warning: release_cb() might need to release sk ownership,
2394 	 * i.e. call sock_release_ownership(sk) before us.
2395 	 */
2396 	if (sk->sk_prot->release_cb)
2397 		sk->sk_prot->release_cb(sk);
2398 
2399 	sock_release_ownership(sk);
2400 	if (waitqueue_active(&sk->sk_lock.wq))
2401 		wake_up(&sk->sk_lock.wq);
2402 	spin_unlock_bh(&sk->sk_lock.slock);
2403 }
2404 EXPORT_SYMBOL(release_sock);
2405 
2406 /**
2407  * lock_sock_fast - fast version of lock_sock
2408  * @sk: socket
2409  *
2410  * This version should be used for very small sections where the process
2411  * won't block. Returns false if the fast path is taken:
2412  *   sk_lock.slock locked, owned = 0, BH disabled
2413  * Returns true if the slow path is taken:
2414  *   sk_lock.slock unlocked, owned = 1, BH enabled
2415  */
2416 bool lock_sock_fast(struct sock *sk)
2417 {
2418 	might_sleep();
2419 	spin_lock_bh(&sk->sk_lock.slock);
2420 
2421 	if (!sk->sk_lock.owned)
2422 		/*
2423 		 * Note: we return with BH disabled and sk_lock.slock held
2424 		 */
2425 		return false;
2426 
2427 	__lock_sock(sk);
2428 	sk->sk_lock.owned = 1;
2429 	spin_unlock(&sk->sk_lock.slock);
2430 	/*
2431 	 * The sk_lock has mutex_lock() semantics here:
2432 	 */
2433 	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2434 	local_bh_enable();
2435 	return true;
2436 }
2437 EXPORT_SYMBOL(lock_sock_fast);
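
/*
 * Example (illustrative sketch only): callers pair lock_sock_fast() with
 * unlock_sock_fast(), passing back the "slow" result so the matching
 * unlock path is taken.
 *
 *	bool slow = lock_sock_fast(sk);
 *
 *	// short, non-blocking critical section on sk state
 *	sk->sk_err = 0;
 *
 *	unlock_sock_fast(sk, slow);
 */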
2438 
2439 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2440 {
2441 	struct timeval tv;
2442 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2443 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2444 	tv = ktime_to_timeval(sk->sk_stamp);
2445 	if (tv.tv_sec == -1)
2446 		return -ENOENT;
2447 	if (tv.tv_sec == 0) {
2448 		sk->sk_stamp = ktime_get_real();
2449 		tv = ktime_to_timeval(sk->sk_stamp);
2450 	}
2451 	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2452 }
2453 EXPORT_SYMBOL(sock_get_timestamp);
2454 
2455 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2456 {
2457 	struct timespec ts;
2458 	if (!sock_flag(sk, SOCK_TIMESTAMP))
2459 		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2460 	ts = ktime_to_timespec(sk->sk_stamp);
2461 	if (ts.tv_sec == -1)
2462 		return -ENOENT;
2463 	if (ts.tv_sec == 0) {
2464 		sk->sk_stamp = ktime_get_real();
2465 		ts = ktime_to_timespec(sk->sk_stamp);
2466 	}
2467 	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2468 }
2469 EXPORT_SYMBOL(sock_get_timestampns);
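
/*
 * Example (illustrative sketch only): protocols expose the two helpers
 * above through their ioctl() handlers, e.g.:
 *
 *	case SIOCGSTAMP:
 *		return sock_get_timestamp(sk, (struct timeval __user *)arg);
 *	case SIOCGSTAMPNS:
 *		return sock_get_timestampns(sk, (struct timespec __user *)arg);
 */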
2470 
2471 void sock_enable_timestamp(struct sock *sk, int flag)
2472 {
2473 	if (!sock_flag(sk, flag)) {
2474 		unsigned long previous_flags = sk->sk_flags;
2475 
2476 		sock_set_flag(sk, flag);
2477 		/*
2478 		 * We just set one of the two flags which require net
2479 		 * time stamping, but time stamping might have been on
2480 		 * already because of the other one.
2481 		 */
2482 		if (!(previous_flags & SK_FLAGS_TIMESTAMP))
2483 			net_enable_timestamp();
2484 	}
2485 }
2486 
2487 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2488 		       int level, int type)
2489 {
2490 	struct sock_exterr_skb *serr;
2491 	struct sk_buff *skb;
2492 	int copied, err;
2493 
2494 	err = -EAGAIN;
2495 	skb = sock_dequeue_err_skb(sk);
2496 	if (skb == NULL)
2497 		goto out;
2498 
2499 	copied = skb->len;
2500 	if (copied > len) {
2501 		msg->msg_flags |= MSG_TRUNC;
2502 		copied = len;
2503 	}
2504 	err = skb_copy_datagram_msg(skb, 0, msg, copied);
2505 	if (err)
2506 		goto out_free_skb;
2507 
2508 	sock_recv_timestamp(msg, sk, skb);
2509 
2510 	serr = SKB_EXT_ERR(skb);
2511 	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2512 
2513 	msg->msg_flags |= MSG_ERRQUEUE;
2514 	err = copied;
2515 
2516 out_free_skb:
2517 	kfree_skb(skb);
2518 out:
2519 	return err;
2520 }
2521 EXPORT_SYMBOL(sock_recv_errqueue);
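
/*
 * Example (illustrative sketch only): a protocol's recvmsg() usually
 * branches to sock_recv_errqueue() when userspace asks for MSG_ERRQUEUE;
 * AF_PACKET, for instance, passes SOL_PACKET/PACKET_TX_TIMESTAMP as the
 * cmsg level/type.
 *
 *	if (flags & MSG_ERRQUEUE)
 *		return sock_recv_errqueue(sk, msg, len,
 *					  SOL_PACKET, PACKET_TX_TIMESTAMP);
 */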
2522 
2523 /*
2524  *	Get a socket option on a socket.
2525  *
2526  *	FIX: POSIX 1003.1g is very ambiguous here. It states that
2527  *	asynchronous errors should be reported by getsockopt. We assume
2528  *	this means if you specify SO_ERROR (otherwise what's the point of it).
2529  */
2530 int sock_common_getsockopt(struct socket *sock, int level, int optname,
2531 			   char __user *optval, int __user *optlen)
2532 {
2533 	struct sock *sk = sock->sk;
2534 
2535 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2536 }
2537 EXPORT_SYMBOL(sock_common_getsockopt);
2538 
2539 #ifdef CONFIG_COMPAT
2540 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2541 				  char __user *optval, int __user *optlen)
2542 {
2543 	struct sock *sk = sock->sk;
2544 
2545 	if (sk->sk_prot->compat_getsockopt != NULL)
2546 		return sk->sk_prot->compat_getsockopt(sk, level, optname,
2547 						      optval, optlen);
2548 	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2549 }
2550 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2551 #endif
2552 
2553 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2554 			int flags)
2555 {
2556 	struct sock *sk = sock->sk;
2557 	int addr_len = 0;
2558 	int err;
2559 
2560 	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2561 				   flags & ~MSG_DONTWAIT, &addr_len);
2562 	if (err >= 0)
2563 		msg->msg_namelen = addr_len;
2564 	return err;
2565 }
2566 EXPORT_SYMBOL(sock_common_recvmsg);
2567 
2568 /*
2569  *	Set socket options on an inet socket.
2570  */
2571 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2572 			   char __user *optval, unsigned int optlen)
2573 {
2574 	struct sock *sk = sock->sk;
2575 
2576 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2577 }
2578 EXPORT_SYMBOL(sock_common_setsockopt);
2579 
2580 #ifdef CONFIG_COMPAT
2581 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2582 				  char __user *optval, unsigned int optlen)
2583 {
2584 	struct sock *sk = sock->sk;
2585 
2586 	if (sk->sk_prot->compat_setsockopt != NULL)
2587 		return sk->sk_prot->compat_setsockopt(sk, level, optname,
2588 						      optval, optlen);
2589 	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2590 }
2591 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2592 #endif
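
/*
 * Example (illustrative sketch only): families whose struct proto provides
 * setsockopt/getsockopt/recvmsg can wire the sock_common_*() helpers
 * directly into their proto_ops, as several in-tree families do:
 *
 *	.setsockopt	= sock_common_setsockopt,
 *	.getsockopt	= sock_common_getsockopt,
 *	.recvmsg	= sock_common_recvmsg,
 */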
2593 
2594 void sk_common_release(struct sock *sk)
2595 {
2596 	if (sk->sk_prot->destroy)
2597 		sk->sk_prot->destroy(sk);
2598 
2599 	/*
2600 	 * Observation: when sk_common_release() is called, processes have
2601 	 * no access to the socket, but the network stack still does.
2602 	 * Step one, detach it from networking:
2603 	 *
2604 	 * A. Remove from hash tables.
2605 	 */
2606 
2607 	sk->sk_prot->unhash(sk);
2608 
2609 	/*
2610 	 * At this point the socket cannot receive new packets, but it is possible
2611 	 * that some packets are in flight because some CPU runs the receiver and
2612 	 * did the hash table lookup before we unhashed the socket. They will reach
2613 	 * the receive queue and will be purged by the socket destructor.
2614 	 *
2615 	 * Also, we still have packets pending on the receive queue and, probably,
2616 	 * our own packets waiting in device queues. sock_destroy will drain the
2617 	 * receive queue, but transmitted packets will delay socket destruction
2618 	 * until the last reference is released.
2619 	 */
2620 
2621 	sock_orphan(sk);
2622 
2623 	xfrm_sk_free_policy(sk);
2624 
2625 	sk_refcnt_debug_release(sk);
2626 
2627 	if (sk->sk_frag.page) {
2628 		put_page(sk->sk_frag.page);
2629 		sk->sk_frag.page = NULL;
2630 	}
2631 
2632 	sock_put(sk);
2633 }
2634 EXPORT_SYMBOL(sk_common_release);
2635 
2636 #ifdef CONFIG_PROC_FS
2637 #define PROTO_INUSE_NR	64	/* should be enough for the first time */
2638 struct prot_inuse {
2639 	int val[PROTO_INUSE_NR];
2640 };
2641 
2642 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2643 
2644 #ifdef CONFIG_NET_NS
2645 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2646 {
2647 	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
2648 }
2649 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2650 
2651 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2652 {
2653 	int cpu, idx = prot->inuse_idx;
2654 	int res = 0;
2655 
2656 	for_each_possible_cpu(cpu)
2657 		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2658 
2659 	return res >= 0 ? res : 0;
2660 }
2661 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2662 
2663 static int __net_init sock_inuse_init_net(struct net *net)
2664 {
2665 	net->core.inuse = alloc_percpu(struct prot_inuse);
2666 	return net->core.inuse ? 0 : -ENOMEM;
2667 }
2668 
2669 static void __net_exit sock_inuse_exit_net(struct net *net)
2670 {
2671 	free_percpu(net->core.inuse);
2672 }
2673 
2674 static struct pernet_operations net_inuse_ops = {
2675 	.init = sock_inuse_init_net,
2676 	.exit = sock_inuse_exit_net,
2677 };
2678 
2679 static __init int net_inuse_init(void)
2680 {
2681 	if (register_pernet_subsys(&net_inuse_ops))
2682 		panic("Cannot initialize net inuse counters");
2683 
2684 	return 0;
2685 }
2686 
2687 core_initcall(net_inuse_init);
2688 #else
2689 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2690 
2691 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2692 {
2693 	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
2694 }
2695 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2696 
2697 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2698 {
2699 	int cpu, idx = prot->inuse_idx;
2700 	int res = 0;
2701 
2702 	for_each_possible_cpu(cpu)
2703 		res += per_cpu(prot_inuse, cpu).val[idx];
2704 
2705 	return res >= 0 ? res : 0;
2706 }
2707 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2708 #endif
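
/*
 * Example (illustrative sketch only): the counters above are meant to be
 * bumped from a protocol's hash()/unhash() callbacks, e.g.:
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);	// on hash
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);	// on unhash
 *
 * The per-protocol totals then show up in the "sockets" column of
 * /proc/net/protocols.
 */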
2709 
2710 static void assign_proto_idx(struct proto *prot)
2711 {
2712 	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2713 
2714 	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2715 		pr_err("PROTO_INUSE_NR exhausted\n");
2716 		return;
2717 	}
2718 
2719 	set_bit(prot->inuse_idx, proto_inuse_idx);
2720 }
2721 
2722 static void release_proto_idx(struct proto *prot)
2723 {
2724 	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2725 		clear_bit(prot->inuse_idx, proto_inuse_idx);
2726 }
2727 #else
2728 static inline void assign_proto_idx(struct proto *prot)
2729 {
2730 }
2731 
2732 static inline void release_proto_idx(struct proto *prot)
2733 {
2734 }
2735 #endif
2736 
2737 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
2738 {
2739 	if (!rsk_prot)
2740 		return;
2741 	kfree(rsk_prot->slab_name);
2742 	rsk_prot->slab_name = NULL;
2743 	if (rsk_prot->slab) {
2744 		kmem_cache_destroy(rsk_prot->slab);
2745 		rsk_prot->slab = NULL;
2746 	}
2747 }
2748 
2749 static int req_prot_init(const struct proto *prot)
2750 {
2751 	struct request_sock_ops *rsk_prot = prot->rsk_prot;
2752 
2753 	if (!rsk_prot)
2754 		return 0;
2755 
2756 	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
2757 					prot->name);
2758 	if (!rsk_prot->slab_name)
2759 		return -ENOMEM;
2760 
2761 	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
2762 					   rsk_prot->obj_size, 0,
2763 					   0, NULL);
2764 
2765 	if (!rsk_prot->slab) {
2766 		pr_crit("%s: Can't create request sock SLAB cache!\n",
2767 			prot->name);
2768 		return -ENOMEM;
2769 	}
2770 	return 0;
2771 }
2772 
2773 int proto_register(struct proto *prot, int alloc_slab)
2774 {
2775 	if (alloc_slab) {
2776 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2777 					SLAB_HWCACHE_ALIGN | prot->slab_flags,
2778 					NULL);
2779 
2780 		if (prot->slab == NULL) {
2781 			pr_crit("%s: Can't create sock SLAB cache!\n",
2782 				prot->name);
2783 			goto out;
2784 		}
2785 
2786 		if (req_prot_init(prot))
2787 			goto out_free_request_sock_slab;
2788 
2789 		if (prot->twsk_prot != NULL) {
2790 			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
2791 
2792 			if (prot->twsk_prot->twsk_slab_name == NULL)
2793 				goto out_free_request_sock_slab;
2794 
2795 			prot->twsk_prot->twsk_slab =
2796 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2797 						  prot->twsk_prot->twsk_obj_size,
2798 						  0,
2799 						  prot->slab_flags,
2800 						  NULL);
2801 			if (prot->twsk_prot->twsk_slab == NULL)
2802 				goto out_free_timewait_sock_slab_name;
2803 		}
2804 	}
2805 
2806 	mutex_lock(&proto_list_mutex);
2807 	list_add(&prot->node, &proto_list);
2808 	assign_proto_idx(prot);
2809 	mutex_unlock(&proto_list_mutex);
2810 	return 0;
2811 
2812 out_free_timewait_sock_slab_name:
2813 	kfree(prot->twsk_prot->twsk_slab_name);
2814 out_free_request_sock_slab:
2815 	req_prot_cleanup(prot->rsk_prot);
2816 
2817 	kmem_cache_destroy(prot->slab);
2818 	prot->slab = NULL;
2819 out:
2820 	return -ENOBUFS;
2821 }
2822 EXPORT_SYMBOL(proto_register);
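
/*
 * Example (illustrative sketch only): a protocol module normally registers
 * its struct proto from module init and backs it out on failure or exit.
 * example_proto, struct example_sock and example_family_ops are
 * hypothetical.
 *
 *	static struct proto example_proto = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		int rc = proto_register(&example_proto, 1);	// 1: alloc slab
 *
 *		if (rc)
 *			return rc;
 *		rc = sock_register(&example_family_ops);
 *		if (rc)
 *			proto_unregister(&example_proto);
 *		return rc;
 *	}
 */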
2823 
2824 void proto_unregister(struct proto *prot)
2825 {
2826 	mutex_lock(&proto_list_mutex);
2827 	release_proto_idx(prot);
2828 	list_del(&prot->node);
2829 	mutex_unlock(&proto_list_mutex);
2830 
2831 	if (prot->slab != NULL) {
2832 		kmem_cache_destroy(prot->slab);
2833 		prot->slab = NULL;
2834 	}
2835 
2836 	req_prot_cleanup(prot->rsk_prot);
2837 
2838 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2839 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2840 		kfree(prot->twsk_prot->twsk_slab_name);
2841 		prot->twsk_prot->twsk_slab = NULL;
2842 	}
2843 }
2844 EXPORT_SYMBOL(proto_unregister);
2845 
2846 #ifdef CONFIG_PROC_FS
2847 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2848 	__acquires(proto_list_mutex)
2849 {
2850 	mutex_lock(&proto_list_mutex);
2851 	return seq_list_start_head(&proto_list, *pos);
2852 }
2853 
2854 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2855 {
2856 	return seq_list_next(v, &proto_list, pos);
2857 }
2858 
2859 static void proto_seq_stop(struct seq_file *seq, void *v)
2860 	__releases(proto_list_mutex)
2861 {
2862 	mutex_unlock(&proto_list_mutex);
2863 }
2864 
2865 static char proto_method_implemented(const void *method)
2866 {
2867 	return method == NULL ? 'n' : 'y';
2868 }
2869 static long sock_prot_memory_allocated(struct proto *proto)
2870 {
2871 	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
2872 }
2873 
2874 static char *sock_prot_memory_pressure(struct proto *proto)
2875 {
2876 	return proto->memory_pressure != NULL ?
2877 	proto_memory_pressure(proto) ? "yes" : "no" : "NI";
2878 }
2879 
2880 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2881 {
2882 
2883 	seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
2884 			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2885 		   proto->name,
2886 		   proto->obj_size,
2887 		   sock_prot_inuse_get(seq_file_net(seq), proto),
2888 		   sock_prot_memory_allocated(proto),
2889 		   sock_prot_memory_pressure(proto),
2890 		   proto->max_header,
2891 		   proto->slab == NULL ? "no" : "yes",
2892 		   module_name(proto->owner),
2893 		   proto_method_implemented(proto->close),
2894 		   proto_method_implemented(proto->connect),
2895 		   proto_method_implemented(proto->disconnect),
2896 		   proto_method_implemented(proto->accept),
2897 		   proto_method_implemented(proto->ioctl),
2898 		   proto_method_implemented(proto->init),
2899 		   proto_method_implemented(proto->destroy),
2900 		   proto_method_implemented(proto->shutdown),
2901 		   proto_method_implemented(proto->setsockopt),
2902 		   proto_method_implemented(proto->getsockopt),
2903 		   proto_method_implemented(proto->sendmsg),
2904 		   proto_method_implemented(proto->recvmsg),
2905 		   proto_method_implemented(proto->sendpage),
2906 		   proto_method_implemented(proto->bind),
2907 		   proto_method_implemented(proto->backlog_rcv),
2908 		   proto_method_implemented(proto->hash),
2909 		   proto_method_implemented(proto->unhash),
2910 		   proto_method_implemented(proto->get_port),
2911 		   proto_method_implemented(proto->enter_memory_pressure));
2912 }
2913 
2914 static int proto_seq_show(struct seq_file *seq, void *v)
2915 {
2916 	if (v == &proto_list)
2917 		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2918 			   "protocol",
2919 			   "size",
2920 			   "sockets",
2921 			   "memory",
2922 			   "press",
2923 			   "maxhdr",
2924 			   "slab",
2925 			   "module",
2926 			   "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2927 	else
2928 		proto_seq_printf(seq, list_entry(v, struct proto, node));
2929 	return 0;
2930 }
2931 
2932 static const struct seq_operations proto_seq_ops = {
2933 	.start  = proto_seq_start,
2934 	.next   = proto_seq_next,
2935 	.stop   = proto_seq_stop,
2936 	.show   = proto_seq_show,
2937 };
2938 
2939 static int proto_seq_open(struct inode *inode, struct file *file)
2940 {
2941 	return seq_open_net(inode, file, &proto_seq_ops,
2942 			    sizeof(struct seq_net_private));
2943 }
2944 
2945 static const struct file_operations proto_seq_fops = {
2946 	.owner		= THIS_MODULE,
2947 	.open		= proto_seq_open,
2948 	.read		= seq_read,
2949 	.llseek		= seq_lseek,
2950 	.release	= seq_release_net,
2951 };
2952 
2953 static __net_init int proto_init_net(struct net *net)
2954 {
2955 	if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops))
2956 		return -ENOMEM;
2957 
2958 	return 0;
2959 }
2960 
2961 static __net_exit void proto_exit_net(struct net *net)
2962 {
2963 	remove_proc_entry("protocols", net->proc_net);
2964 }
2965 
2966 
2967 static __net_initdata struct pernet_operations proto_net_ops = {
2968 	.init = proto_init_net,
2969 	.exit = proto_exit_net,
2970 };
2971 
2972 static int __init proto_init(void)
2973 {
2974 	return register_pernet_subsys(&proto_net_ops);
2975 }
2976 
2977 subsys_initcall(proto_init);
2978 
2979 #endif /* CONFIG_PROC_FS */
2980