1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
6 *
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
9 *
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
14 *
15 * Fixes:
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
36 * TCP layer surgery.
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
64 * (compatibility fix)
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
82 *
83 * To Fix:
84 */
85
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
87
88 #include <linux/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
94 #include <linux/in.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
121
122 #include <linux/uaccess.h>
123
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <linux/skbuff_ref.h>
128 #include <net/net_namespace.h>
129 #include <net/request_sock.h>
130 #include <net/sock.h>
131 #include <net/proto_memory.h>
132 #include <linux/net_tstamp.h>
133 #include <net/xfrm.h>
134 #include <linux/ipsec.h>
135 #include <net/cls_cgroup.h>
136 #include <net/netprio_cgroup.h>
137 #include <linux/sock_diag.h>
138
139 #include <linux/filter.h>
140 #include <net/sock_reuseport.h>
141 #include <net/bpf_sk_storage.h>
142
143 #include <trace/events/sock.h>
144
145 #include <net/tcp.h>
146 #include <net/busy_poll.h>
147 #include <net/phonet/phonet.h>
148
149 #include <linux/ethtool.h>
150
151 #include <uapi/linux/pidfd.h>
152
153 #include "dev.h"
154
155 static DEFINE_MUTEX(proto_list_mutex);
156 static LIST_HEAD(proto_list);
157
158 static void sock_def_write_space_wfree(struct sock *sk);
159 static void sock_def_write_space(struct sock *sk);
160
161 /**
162 * sk_ns_capable - General socket capability test
163 * @sk: Socket to use a capability on or through
164 * @user_ns: The user namespace of the capability to use
165 * @cap: The capability to use
166 *
167 * Test to see if the opener of the socket had the capability @cap when
168 * the socket was created and if the current process has the capability
169 * @cap in the user namespace @user_ns.
170 */
171 bool sk_ns_capable(const struct sock *sk,
172 struct user_namespace *user_ns, int cap)
173 {
174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
175 ns_capable(user_ns, cap);
176 }
177 EXPORT_SYMBOL(sk_ns_capable);
178
179 /**
180 * sk_capable - Socket global capability test
181 * @sk: Socket to use a capability on or through
182 * @cap: The global capability to use
183 *
184 * Test to see if the opener of the socket had the capability @cap when
185 * the socket was created and if the current process has the capability
186 * @cap in all user namespaces.
187 */
188 bool sk_capable(const struct sock *sk, int cap)
189 {
190 return sk_ns_capable(sk, &init_user_ns, cap);
191 }
192 EXPORT_SYMBOL(sk_capable);
193
194 /**
195 * sk_net_capable - Network namespace socket capability test
196 * @sk: Socket to use a capability on or through
197 * @cap: The capability to use
198 *
199 * Test to see if the opener of the socket had the capability @cap when the
200 * socket was created and if the current process has the capability @cap
201 * over the network namespace the socket is a member of.
202 */
203 bool sk_net_capable(const struct sock *sk, int cap)
204 {
205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
206 }
207 EXPORT_SYMBOL(sk_net_capable);
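/* Usage sketch (illustrative only, not part of this file): a protocol that
 * wants to restrict a privileged option to callers with privileges over the
 * socket's network namespace could gate it as follows; the surrounding
 * handler is assumed.
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 */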
208
209 /*
210 * Each address family might have different locking rules, so we have
211 * one slock key per address family and separate keys for internal and
212 * userspace sockets.
213 */
214 static struct lock_class_key af_family_keys[AF_MAX];
215 static struct lock_class_key af_family_kern_keys[AF_MAX];
216 static struct lock_class_key af_family_slock_keys[AF_MAX];
217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
218
219 /*
220 * Make lock validator output more readable. (we pre-construct these
221 * strings build-time, so that runtime initialization of socket
222 * locks is fast):
223 */
224
225 #define _sock_locks(x) \
226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
235 x "27" , x "28" , x "AF_CAN" , \
236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
241 x "AF_MCTP" , \
242 x "AF_MAX"
243
244 static const char *const af_family_key_strings[AF_MAX+1] = {
245 _sock_locks("sk_lock-")
246 };
247 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
248 _sock_locks("slock-")
249 };
250 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
251 _sock_locks("clock-")
252 };
253
254 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
255 _sock_locks("k-sk_lock-")
256 };
257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
258 _sock_locks("k-slock-")
259 };
260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
261 _sock_locks("k-clock-")
262 };
263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
264 _sock_locks("rlock-")
265 };
266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
267 _sock_locks("wlock-")
268 };
269 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
270 _sock_locks("elock-")
271 };
272
273 /*
274 * sk_callback_lock and sk queues locking rules are per-address-family,
275 * so split the lock classes by using a per-AF key:
276 */
277 static struct lock_class_key af_callback_keys[AF_MAX];
278 static struct lock_class_key af_rlock_keys[AF_MAX];
279 static struct lock_class_key af_wlock_keys[AF_MAX];
280 static struct lock_class_key af_elock_keys[AF_MAX];
281 static struct lock_class_key af_kern_callback_keys[AF_MAX];
282
283 /* Run time adjustable parameters. */
284 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
285 EXPORT_SYMBOL(sysctl_wmem_max);
286 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
287 EXPORT_SYMBOL(sysctl_rmem_max);
288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
290
291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
292 EXPORT_SYMBOL_GPL(memalloc_socks_key);
293
294 /**
295 * sk_set_memalloc - sets %SOCK_MEMALLOC
296 * @sk: socket to set it on
297 *
298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
299 * It's the responsibility of the admin to adjust min_free_kbytes
300 * to meet the requirements.
301 */
302 void sk_set_memalloc(struct sock *sk)
303 {
304 sock_set_flag(sk, SOCK_MEMALLOC);
305 sk->sk_allocation |= __GFP_MEMALLOC;
306 static_branch_inc(&memalloc_socks_key);
307 }
308 EXPORT_SYMBOL_GPL(sk_set_memalloc);
309
310 void sk_clear_memalloc(struct sock *sk)
311 {
312 sock_reset_flag(sk, SOCK_MEMALLOC);
313 sk->sk_allocation &= ~__GFP_MEMALLOC;
314 static_branch_dec(&memalloc_socks_key);
315
316 /*
317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
318 * progress of swapping. SOCK_MEMALLOC may be cleared while
319 * it has rmem allocations due to the last swapfile being deactivated
320 * but there is a risk that the socket is unusable due to exceeding
321 * the rmem limits. Reclaim the reserves and obey rmem limits again.
322 */
323 sk_mem_reclaim(sk);
324 }
325 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
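/* Usage sketch (illustrative only): a swap-over-network backend would
 * typically mark its transport socket while a swapfile is active and clear
 * the flag when it is deactivated. The call sites below are assumed; only
 * the helpers themselves come from this file.
 *
 *	sk_set_memalloc(sock->sk);	// swapon: may dip into reserves
 *	...
 *	sk_clear_memalloc(sock->sk);	// swapoff: reclaim, obey limits again
 */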
326
327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
328 {
329 int ret;
330 unsigned int noreclaim_flag;
331
332 /* these should have been dropped before queueing */
333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
334
335 noreclaim_flag = memalloc_noreclaim_save();
336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
337 tcp_v6_do_rcv,
338 tcp_v4_do_rcv,
339 sk, skb);
340 memalloc_noreclaim_restore(noreclaim_flag);
341
342 return ret;
343 }
344 EXPORT_SYMBOL(__sk_backlog_rcv);
345
346 void sk_error_report(struct sock *sk)
347 {
348 sk->sk_error_report(sk);
349
350 switch (sk->sk_family) {
351 case AF_INET:
352 fallthrough;
353 case AF_INET6:
354 trace_inet_sk_error_report(sk);
355 break;
356 default:
357 break;
358 }
359 }
360 EXPORT_SYMBOL(sk_error_report);
361
362 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
363 {
364 struct __kernel_sock_timeval tv;
365
366 if (timeo == MAX_SCHEDULE_TIMEOUT) {
367 tv.tv_sec = 0;
368 tv.tv_usec = 0;
369 } else {
370 tv.tv_sec = timeo / HZ;
371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
372 }
373
374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
376 *(struct old_timeval32 *)optval = tv32;
377 return sizeof(tv32);
378 }
379
380 if (old_timeval) {
381 struct __kernel_old_timeval old_tv;
382 old_tv.tv_sec = tv.tv_sec;
383 old_tv.tv_usec = tv.tv_usec;
384 *(struct __kernel_old_timeval *)optval = old_tv;
385 return sizeof(old_tv);
386 }
387
388 *(struct __kernel_sock_timeval *)optval = tv;
389 return sizeof(tv);
390 }
391 EXPORT_SYMBOL(sock_get_timeout);
392
393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
394 sockptr_t optval, int optlen, bool old_timeval)
395 {
396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
397 struct old_timeval32 tv32;
398
399 if (optlen < sizeof(tv32))
400 return -EINVAL;
401
402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
403 return -EFAULT;
404 tv->tv_sec = tv32.tv_sec;
405 tv->tv_usec = tv32.tv_usec;
406 } else if (old_timeval) {
407 struct __kernel_old_timeval old_tv;
408
409 if (optlen < sizeof(old_tv))
410 return -EINVAL;
411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
412 return -EFAULT;
413 tv->tv_sec = old_tv.tv_sec;
414 tv->tv_usec = old_tv.tv_usec;
415 } else {
416 if (optlen < sizeof(*tv))
417 return -EINVAL;
418 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
419 return -EFAULT;
420 }
421
422 return 0;
423 }
424 EXPORT_SYMBOL(sock_copy_user_timeval);
425
426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
427 bool old_timeval)
428 {
429 struct __kernel_sock_timeval tv;
430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
431 long val;
432
433 if (err)
434 return err;
435
436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
437 return -EDOM;
438
439 if (tv.tv_sec < 0) {
440 static int warned __read_mostly;
441
442 WRITE_ONCE(*timeo_p, 0);
443 if (warned < 10 && net_ratelimit()) {
444 warned++;
445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
446 __func__, current->comm, task_pid_nr(current));
447 }
448 return 0;
449 }
450 val = MAX_SCHEDULE_TIMEOUT;
451 if ((tv.tv_sec || tv.tv_usec) &&
452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
454 USEC_PER_SEC / HZ);
455 WRITE_ONCE(*timeo_p, val);
456 return 0;
457 }
458
459 static bool sk_set_prio_allowed(const struct sock *sk, int val)
460 {
461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
464 }
465
466 static bool sock_needs_netstamp(const struct sock *sk)
467 {
468 switch (sk->sk_family) {
469 case AF_UNSPEC:
470 case AF_UNIX:
471 return false;
472 default:
473 return true;
474 }
475 }
476
477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
478 {
479 if (sk->sk_flags & flags) {
480 sk->sk_flags &= ~flags;
481 if (sock_needs_netstamp(sk) &&
482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
483 net_disable_timestamp();
484 }
485 }
486
487
488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
489 {
490 unsigned long flags;
491 struct sk_buff_head *list = &sk->sk_receive_queue;
492
493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
494 atomic_inc(&sk->sk_drops);
495 trace_sock_rcvqueue_full(sk, skb);
496 return -ENOMEM;
497 }
498
499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
500 atomic_inc(&sk->sk_drops);
501 return -ENOBUFS;
502 }
503
504 skb->dev = NULL;
505 skb_set_owner_r(skb, sk);
506
507 /* We escape from the RCU protected region, make sure we don't leak
508 * a non-refcounted dst
509 */
510 skb_dst_force(skb);
511
512 spin_lock_irqsave(&list->lock, flags);
513 sock_skb_set_dropcount(sk, skb);
514 __skb_queue_tail(list, skb);
515 spin_unlock_irqrestore(&list->lock, flags);
516
517 if (!sock_flag(sk, SOCK_DEAD))
518 sk->sk_data_ready(sk);
519 return 0;
520 }
521 EXPORT_SYMBOL(__sock_queue_rcv_skb);
522
523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
524 enum skb_drop_reason *reason)
525 {
526 enum skb_drop_reason drop_reason;
527 int err;
528
529 err = sk_filter(sk, skb);
530 if (err) {
531 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
532 goto out;
533 }
534 err = __sock_queue_rcv_skb(sk, skb);
535 switch (err) {
536 case -ENOMEM:
537 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
538 break;
539 case -ENOBUFS:
540 drop_reason = SKB_DROP_REASON_PROTO_MEM;
541 break;
542 default:
543 drop_reason = SKB_NOT_DROPPED_YET;
544 break;
545 }
546 out:
547 if (reason)
548 *reason = drop_reason;
549 return err;
550 }
551 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
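/* Usage sketch (illustrative only): a datagram protocol's receive path could
 * queue the skb and propagate the drop reason on failure; the caller shown
 * here is hypothetical.
 *
 *	enum skb_drop_reason reason;
 *
 *	if (sock_queue_rcv_skb_reason(sk, skb, &reason) < 0)
 *		kfree_skb_reason(skb, reason);
 */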
552
553 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
554 const int nested, unsigned int trim_cap, bool refcounted)
555 {
556 int rc = NET_RX_SUCCESS;
557
558 if (sk_filter_trim_cap(sk, skb, trim_cap))
559 goto discard_and_relse;
560
561 skb->dev = NULL;
562
563 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
564 atomic_inc(&sk->sk_drops);
565 goto discard_and_relse;
566 }
567 if (nested)
568 bh_lock_sock_nested(sk);
569 else
570 bh_lock_sock(sk);
571 if (!sock_owned_by_user(sk)) {
572 /*
573 * trylock + unlock semantics:
574 */
575 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
576
577 rc = sk_backlog_rcv(sk, skb);
578
579 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
580 } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
581 bh_unlock_sock(sk);
582 atomic_inc(&sk->sk_drops);
583 goto discard_and_relse;
584 }
585
586 bh_unlock_sock(sk);
587 out:
588 if (refcounted)
589 sock_put(sk);
590 return rc;
591 discard_and_relse:
592 kfree_skb(skb);
593 goto out;
594 }
595 EXPORT_SYMBOL(__sk_receive_skb);
596
597 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
598 u32));
599 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
600 u32));
601 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
602 {
603 struct dst_entry *dst = __sk_dst_get(sk);
604
605 if (dst && dst->obsolete &&
606 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
607 dst, cookie) == NULL) {
608 sk_tx_queue_clear(sk);
609 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
610 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
611 dst_release(dst);
612 return NULL;
613 }
614
615 return dst;
616 }
617 EXPORT_SYMBOL(__sk_dst_check);
618
619 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
620 {
621 struct dst_entry *dst = sk_dst_get(sk);
622
623 if (dst && dst->obsolete &&
624 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
625 dst, cookie) == NULL) {
626 sk_dst_reset(sk);
627 dst_release(dst);
628 return NULL;
629 }
630
631 return dst;
632 }
633 EXPORT_SYMBOL(sk_dst_check);
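/* Usage sketch (illustrative only): an output path revalidating its cached
 * route before transmit. Here "cookie" stands for the address-family specific
 * validity value and the re-routing step is elided.
 *
 *	dst = sk_dst_check(sk, cookie);
 *	if (!dst)
 *		dst = ...;	// re-route, then sk_dst_set()
 */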
634
635 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
636 {
637 int ret = -ENOPROTOOPT;
638 #ifdef CONFIG_NETDEVICES
639 struct net *net = sock_net(sk);
640
641 /* Sorry... */
642 ret = -EPERM;
643 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
644 goto out;
645
646 ret = -EINVAL;
647 if (ifindex < 0)
648 goto out;
649
650 /* Paired with all READ_ONCE() done locklessly. */
651 WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
652
653 if (sk->sk_prot->rehash)
654 sk->sk_prot->rehash(sk);
655 sk_dst_reset(sk);
656
657 ret = 0;
658
659 out:
660 #endif
661
662 return ret;
663 }
664
665 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
666 {
667 int ret;
668
669 if (lock_sk)
670 lock_sock(sk);
671 ret = sock_bindtoindex_locked(sk, ifindex);
672 if (lock_sk)
673 release_sock(sk);
674
675 return ret;
676 }
677 EXPORT_SYMBOL(sock_bindtoindex);
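/* Usage sketch (illustrative only): a kernel socket user pinning traffic to a
 * known interface index; passing lock_sk=true makes the helper take and drop
 * the socket lock itself.
 *
 *	err = sock_bindtoindex(sk, ifindex, true);
 */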
678
679 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
680 {
681 int ret = -ENOPROTOOPT;
682 #ifdef CONFIG_NETDEVICES
683 struct net *net = sock_net(sk);
684 char devname[IFNAMSIZ];
685 int index;
686
687 ret = -EINVAL;
688 if (optlen < 0)
689 goto out;
690
691 /* Bind this socket to a particular device like "eth0",
692 * as specified in the passed interface name. If the
693 * name is "" or the option length is zero the socket
694 * is not bound.
695 */
696 if (optlen > IFNAMSIZ - 1)
697 optlen = IFNAMSIZ - 1;
698 memset(devname, 0, sizeof(devname));
699
700 ret = -EFAULT;
701 if (copy_from_sockptr(devname, optval, optlen))
702 goto out;
703
704 index = 0;
705 if (devname[0] != '\0') {
706 struct net_device *dev;
707
708 rcu_read_lock();
709 dev = dev_get_by_name_rcu(net, devname);
710 if (dev)
711 index = dev->ifindex;
712 rcu_read_unlock();
713 ret = -ENODEV;
714 if (!dev)
715 goto out;
716 }
717
718 sockopt_lock_sock(sk);
719 ret = sock_bindtoindex_locked(sk, index);
720 sockopt_release_sock(sk);
721 out:
722 #endif
723
724 return ret;
725 }
726
727 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
728 sockptr_t optlen, int len)
729 {
730 int ret = -ENOPROTOOPT;
731 #ifdef CONFIG_NETDEVICES
732 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
733 struct net *net = sock_net(sk);
734 char devname[IFNAMSIZ];
735
736 if (bound_dev_if == 0) {
737 len = 0;
738 goto zero;
739 }
740
741 ret = -EINVAL;
742 if (len < IFNAMSIZ)
743 goto out;
744
745 ret = netdev_get_name(net, devname, bound_dev_if);
746 if (ret)
747 goto out;
748
749 len = strlen(devname) + 1;
750
751 ret = -EFAULT;
752 if (copy_to_sockptr(optval, devname, len))
753 goto out;
754
755 zero:
756 ret = -EFAULT;
757 if (copy_to_sockptr(optlen, &len, sizeof(int)))
758 goto out;
759
760 ret = 0;
761
762 out:
763 #endif
764
765 return ret;
766 }
767
768 bool sk_mc_loop(const struct sock *sk)
769 {
770 if (dev_recursion_level())
771 return false;
772 if (!sk)
773 return true;
774 /* IPV6_ADDRFORM can change sk->sk_family under us. */
775 switch (READ_ONCE(sk->sk_family)) {
776 case AF_INET:
777 return inet_test_bit(MC_LOOP, sk);
778 #if IS_ENABLED(CONFIG_IPV6)
779 case AF_INET6:
780 return inet6_test_bit(MC6_LOOP, sk);
781 #endif
782 }
783 WARN_ON_ONCE(1);
784 return true;
785 }
786 EXPORT_SYMBOL(sk_mc_loop);
787
788 void sock_set_reuseaddr(struct sock *sk)
789 {
790 lock_sock(sk);
791 sk->sk_reuse = SK_CAN_REUSE;
792 release_sock(sk);
793 }
794 EXPORT_SYMBOL(sock_set_reuseaddr);
795
796 void sock_set_reuseport(struct sock *sk)
797 {
798 lock_sock(sk);
799 sk->sk_reuseport = true;
800 release_sock(sk);
801 }
802 EXPORT_SYMBOL(sock_set_reuseport);
803
804 void sock_no_linger(struct sock *sk)
805 {
806 lock_sock(sk);
807 WRITE_ONCE(sk->sk_lingertime, 0);
808 sock_set_flag(sk, SOCK_LINGER);
809 release_sock(sk);
810 }
811 EXPORT_SYMBOL(sock_no_linger);
812
813 void sock_set_priority(struct sock *sk, u32 priority)
814 {
815 WRITE_ONCE(sk->sk_priority, priority);
816 }
817 EXPORT_SYMBOL(sock_set_priority);
818
819 void sock_set_sndtimeo(struct sock *sk, s64 secs)
820 {
821 lock_sock(sk);
822 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
823 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
824 else
825 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
826 release_sock(sk);
827 }
828 EXPORT_SYMBOL(sock_set_sndtimeo);
829
830 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
831 {
832 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
833 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
834 if (val) {
835 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
836 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
837 }
838 }
839
840 void sock_enable_timestamps(struct sock *sk)
841 {
842 lock_sock(sk);
843 __sock_set_timestamps(sk, true, false, true);
844 release_sock(sk);
845 }
846 EXPORT_SYMBOL(sock_enable_timestamps);
847
848 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
849 {
850 switch (optname) {
851 case SO_TIMESTAMP_OLD:
852 __sock_set_timestamps(sk, valbool, false, false);
853 break;
854 case SO_TIMESTAMP_NEW:
855 __sock_set_timestamps(sk, valbool, true, false);
856 break;
857 case SO_TIMESTAMPNS_OLD:
858 __sock_set_timestamps(sk, valbool, false, true);
859 break;
860 case SO_TIMESTAMPNS_NEW:
861 __sock_set_timestamps(sk, valbool, true, true);
862 break;
863 }
864 }
865
866 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
867 {
868 struct net *net = sock_net(sk);
869 struct net_device *dev = NULL;
870 bool match = false;
871 int *vclock_index;
872 int i, num;
873
874 if (sk->sk_bound_dev_if)
875 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
876
877 if (!dev) {
878 pr_err("%s: sock not bind to device\n", __func__);
879 return -EOPNOTSUPP;
880 }
881
882 num = ethtool_get_phc_vclocks(dev, &vclock_index);
883 dev_put(dev);
884
885 for (i = 0; i < num; i++) {
886 if (*(vclock_index + i) == phc_index) {
887 match = true;
888 break;
889 }
890 }
891
892 if (num > 0)
893 kfree(vclock_index);
894
895 if (!match)
896 return -EINVAL;
897
898 WRITE_ONCE(sk->sk_bind_phc, phc_index);
899
900 return 0;
901 }
902
903 int sock_set_timestamping(struct sock *sk, int optname,
904 struct so_timestamping timestamping)
905 {
906 int val = timestamping.flags;
907 int ret;
908
909 if (val & ~SOF_TIMESTAMPING_MASK)
910 return -EINVAL;
911
912 if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
913 !(val & SOF_TIMESTAMPING_OPT_ID))
914 return -EINVAL;
915
916 if (val & SOF_TIMESTAMPING_OPT_ID &&
917 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
918 if (sk_is_tcp(sk)) {
919 if ((1 << sk->sk_state) &
920 (TCPF_CLOSE | TCPF_LISTEN))
921 return -EINVAL;
922 if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
923 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
924 else
925 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
926 } else {
927 atomic_set(&sk->sk_tskey, 0);
928 }
929 }
930
931 if (val & SOF_TIMESTAMPING_OPT_STATS &&
932 !(val & SOF_TIMESTAMPING_OPT_TSONLY))
933 return -EINVAL;
934
935 if (val & SOF_TIMESTAMPING_BIND_PHC) {
936 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
937 if (ret)
938 return ret;
939 }
940
941 WRITE_ONCE(sk->sk_tsflags, val);
942 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
943 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));
944
945 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
946 sock_enable_timestamp(sk,
947 SOCK_TIMESTAMPING_RX_SOFTWARE);
948 else
949 sock_disable_timestamp(sk,
950 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
951 return 0;
952 }
953
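/* Userspace sketch (illustrative only) of reaching the path above through
 * SO_TIMESTAMPING; the flag combination is just an example.
 *
 *	struct so_timestamping ts = {
 *		.flags = SOF_TIMESTAMPING_RX_SOFTWARE |
 *			 SOF_TIMESTAMPING_SOFTWARE,
 *	};
 *
 *	setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &ts, sizeof(ts));
 */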
954 #if defined(CONFIG_CGROUP_BPF)
955 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
956 {
957 struct bpf_sock_ops_kern sock_ops;
958
959 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
960 sock_ops.op = op;
961 sock_ops.is_fullsock = 1;
962 sock_ops.sk = sk;
963 bpf_skops_init_skb(&sock_ops, skb, 0);
964 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
965 }
966 #endif
967
968 void sock_set_keepalive(struct sock *sk)
969 {
970 lock_sock(sk);
971 if (sk->sk_prot->keepalive)
972 sk->sk_prot->keepalive(sk, true);
973 sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
974 release_sock(sk);
975 }
976 EXPORT_SYMBOL(sock_set_keepalive);
977
978 static void __sock_set_rcvbuf(struct sock *sk, int val)
979 {
980 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
981 * as a negative value.
982 */
983 val = min_t(int, val, INT_MAX / 2);
984 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
985
986 /* We double it on the way in to account for "struct sk_buff" etc.
987 * overhead. Applications assume that the SO_RCVBUF setting they make
988 * will allow that much actual data to be received on that socket.
989 *
990 * Applications are unaware that "struct sk_buff" and other overheads
991 * allocate from the receive buffer during socket buffer allocation.
992 *
993 * And after considering the possible alternatives, returning the value
994 * we actually used in getsockopt is the most desirable behavior.
995 */
996 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
997 }
998
999 void sock_set_rcvbuf(struct sock *sk, int val)
1000 {
1001 lock_sock(sk);
1002 __sock_set_rcvbuf(sk, val);
1003 release_sock(sk);
1004 }
1005 EXPORT_SYMBOL(sock_set_rcvbuf);
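/* Userspace-visible effect of the doubling described in __sock_set_rcvbuf()
 * above (illustrative only; assumes val does not exceed net.core.rmem_max):
 *
 *	int val = 65536, out;
 *	socklen_t len = sizeof(out);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &out, &len);	// out == 131072
 */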
1006
1007 static void __sock_set_mark(struct sock *sk, u32 val)
1008 {
1009 if (val != sk->sk_mark) {
1010 WRITE_ONCE(sk->sk_mark, val);
1011 sk_dst_reset(sk);
1012 }
1013 }
1014
1015 void sock_set_mark(struct sock *sk, u32 val)
1016 {
1017 lock_sock(sk);
1018 __sock_set_mark(sk, val);
1019 release_sock(sk);
1020 }
1021 EXPORT_SYMBOL(sock_set_mark);
1022
1023 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1024 {
1025 /* Round down bytes to multiple of pages */
1026 bytes = round_down(bytes, PAGE_SIZE);
1027
1028 WARN_ON(bytes > sk->sk_reserved_mem);
1029 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1030 sk_mem_reclaim(sk);
1031 }
1032
1033 static int sock_reserve_memory(struct sock *sk, int bytes)
1034 {
1035 long allocated;
1036 bool charged;
1037 int pages;
1038
1039 if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1040 return -EOPNOTSUPP;
1041
1042 if (!bytes)
1043 return 0;
1044
1045 pages = sk_mem_pages(bytes);
1046
1047 /* pre-charge to memcg */
1048 charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1049 GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1050 if (!charged)
1051 return -ENOMEM;
1052
1053 /* pre-charge to forward_alloc */
1054 sk_memory_allocated_add(sk, pages);
1055 allocated = sk_memory_allocated(sk);
1056 /* If the system goes into memory pressure with this
1057 * precharge, give up and return error.
1058 */
1059 if (allocated > sk_prot_mem_limits(sk, 1)) {
1060 sk_memory_allocated_sub(sk, pages);
1061 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1062 return -ENOMEM;
1063 }
1064 sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1065
1066 WRITE_ONCE(sk->sk_reserved_mem,
1067 sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1068
1069 return 0;
1070 }
1071
1072 #ifdef CONFIG_PAGE_POOL
1073
1074 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
1075 * in 1 syscall. The limit exists to limit the amount of memory the kernel
1076 * allocates to copy these tokens, and to prevent looping over the frags for
1077 * too long.
1078 */
1079 #define MAX_DONTNEED_TOKENS 128
1080 #define MAX_DONTNEED_FRAGS 1024
1081
1082 static noinline_for_stack int
1083 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
1084 {
1085 unsigned int num_tokens, i, j, k, netmem_num = 0;
1086 struct dmabuf_token *tokens;
1087 int ret = 0, num_frags = 0;
1088 netmem_ref netmems[16];
1089
1090 if (!sk_is_tcp(sk))
1091 return -EBADF;
1092
1093 if (optlen % sizeof(*tokens) ||
1094 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
1095 return -EINVAL;
1096
1097 num_tokens = optlen / sizeof(*tokens);
1098 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL);
1099 if (!tokens)
1100 return -ENOMEM;
1101
1102 if (copy_from_sockptr(tokens, optval, optlen)) {
1103 kvfree(tokens);
1104 return -EFAULT;
1105 }
1106
1107 xa_lock_bh(&sk->sk_user_frags);
1108 for (i = 0; i < num_tokens; i++) {
1109 for (j = 0; j < tokens[i].token_count; j++) {
1110 if (++num_frags > MAX_DONTNEED_FRAGS)
1111 goto frag_limit_reached;
1112
1113 netmem_ref netmem = (__force netmem_ref)__xa_erase(
1114 &sk->sk_user_frags, tokens[i].token_start + j);
1115
1116 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
1117 continue;
1118
1119 netmems[netmem_num++] = netmem;
1120 if (netmem_num == ARRAY_SIZE(netmems)) {
1121 xa_unlock_bh(&sk->sk_user_frags);
1122 for (k = 0; k < netmem_num; k++)
1123 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1124 netmem_num = 0;
1125 xa_lock_bh(&sk->sk_user_frags);
1126 }
1127 ret++;
1128 }
1129 }
1130
1131 frag_limit_reached:
1132 xa_unlock_bh(&sk->sk_user_frags);
1133 for (k = 0; k < netmem_num; k++)
1134 WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
1135
1136 kvfree(tokens);
1137 return ret;
1138 }
1139 #endif
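/* Userspace sketch (illustrative only) of handing devmem fragments back with
 * SO_DEVMEM_DONTNEED; the token id is assumed to come from an earlier
 * SCM_DEVMEM_DMABUF control message.
 *
 *	struct dmabuf_token tok = { .token_start = id, .token_count = 1 };
 *
 *	setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 */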
1140
1141 void sockopt_lock_sock(struct sock *sk)
1142 {
1143 /* When current->bpf_ctx is set, the setsockopt is called from
1144 * a bpf prog. bpf has ensured the sk lock has been
1145 * acquired before calling setsockopt().
1146 */
1147 if (has_current_bpf_ctx())
1148 return;
1149
1150 lock_sock(sk);
1151 }
1152 EXPORT_SYMBOL(sockopt_lock_sock);
1153
1154 void sockopt_release_sock(struct sock *sk)
1155 {
1156 if (has_current_bpf_ctx())
1157 return;
1158
1159 release_sock(sk);
1160 }
1161 EXPORT_SYMBOL(sockopt_release_sock);
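/* Usage sketch (illustrative only): a protocol setsockopt handler reachable
 * from both the syscall path and bpf_setsockopt() pairs these helpers; the
 * handler body is assumed.
 *
 *	sockopt_lock_sock(sk);		// no-op when called from a bpf prog
 *	...				// update protocol state
 *	sockopt_release_sock(sk);
 */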
1162
1163 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1164 {
1165 return has_current_bpf_ctx() || ns_capable(ns, cap);
1166 }
1167 EXPORT_SYMBOL(sockopt_ns_capable);
1168
1169 bool sockopt_capable(int cap)
1170 {
1171 return has_current_bpf_ctx() || capable(cap);
1172 }
1173 EXPORT_SYMBOL(sockopt_capable);
1174
1175 static int sockopt_validate_clockid(__kernel_clockid_t value)
1176 {
1177 switch (value) {
1178 case CLOCK_REALTIME:
1179 case CLOCK_MONOTONIC:
1180 case CLOCK_TAI:
1181 return 0;
1182 }
1183 return -EINVAL;
1184 }
1185
1186 /*
1187 * This is meant for all protocols to use and covers goings on
1188 * at the socket level. Everything here is generic.
1189 */
1190
1191 int sk_setsockopt(struct sock *sk, int level, int optname,
1192 sockptr_t optval, unsigned int optlen)
1193 {
1194 struct so_timestamping timestamping;
1195 struct socket *sock = sk->sk_socket;
1196 struct sock_txtime sk_txtime;
1197 int val;
1198 int valbool;
1199 struct linger ling;
1200 int ret = 0;
1201
1202 /*
1203 * Options without arguments
1204 */
1205
1206 if (optname == SO_BINDTODEVICE)
1207 return sock_setbindtodevice(sk, optval, optlen);
1208
1209 if (optlen < sizeof(int))
1210 return -EINVAL;
1211
1212 if (copy_from_sockptr(&val, optval, sizeof(val)))
1213 return -EFAULT;
1214
1215 valbool = val ? 1 : 0;
1216
1217 /* handle options which do not require locking the socket. */
1218 switch (optname) {
1219 case SO_PRIORITY:
1220 if (sk_set_prio_allowed(sk, val)) {
1221 sock_set_priority(sk, val);
1222 return 0;
1223 }
1224 return -EPERM;
1225 case SO_TYPE:
1226 case SO_PROTOCOL:
1227 case SO_DOMAIN:
1228 case SO_ERROR:
1229 return -ENOPROTOOPT;
1230 #ifdef CONFIG_NET_RX_BUSY_POLL
1231 case SO_BUSY_POLL:
1232 if (val < 0)
1233 return -EINVAL;
1234 WRITE_ONCE(sk->sk_ll_usec, val);
1235 return 0;
1236 case SO_PREFER_BUSY_POLL:
1237 if (valbool && !sockopt_capable(CAP_NET_ADMIN))
1238 return -EPERM;
1239 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1240 return 0;
1241 case SO_BUSY_POLL_BUDGET:
1242 if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
1243 !sockopt_capable(CAP_NET_ADMIN))
1244 return -EPERM;
1245 if (val < 0 || val > U16_MAX)
1246 return -EINVAL;
1247 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1248 return 0;
1249 #endif
1250 case SO_MAX_PACING_RATE:
1251 {
1252 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1253 unsigned long pacing_rate;
1254
1255 if (sizeof(ulval) != sizeof(val) &&
1256 optlen >= sizeof(ulval) &&
1257 copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1258 return -EFAULT;
1259 }
1260 if (ulval != ~0UL)
1261 cmpxchg(&sk->sk_pacing_status,
1262 SK_PACING_NONE,
1263 SK_PACING_NEEDED);
1264 /* Pairs with READ_ONCE() from sk_getsockopt() */
1265 WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
1266 pacing_rate = READ_ONCE(sk->sk_pacing_rate);
1267 if (ulval < pacing_rate)
1268 WRITE_ONCE(sk->sk_pacing_rate, ulval);
1269 return 0;
1270 }
1271 case SO_TXREHASH:
1272 if (!sk_is_tcp(sk))
1273 return -EOPNOTSUPP;
1274 if (val < -1 || val > 1)
1275 return -EINVAL;
1276 if ((u8)val == SOCK_TXREHASH_DEFAULT)
1277 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
1278 /* Paired with READ_ONCE() in tcp_rtx_synack()
1279 * and sk_getsockopt().
1280 */
1281 WRITE_ONCE(sk->sk_txrehash, (u8)val);
1282 return 0;
1283 case SO_PEEK_OFF:
1284 {
1285 int (*set_peek_off)(struct sock *sk, int val);
1286
1287 set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
1288 if (set_peek_off)
1289 ret = set_peek_off(sk, val);
1290 else
1291 ret = -EOPNOTSUPP;
1292 return ret;
1293 }
1294 #ifdef CONFIG_PAGE_POOL
1295 case SO_DEVMEM_DONTNEED:
1296 return sock_devmem_dontneed(sk, optval, optlen);
1297 #endif
1298 }
1299
1300 sockopt_lock_sock(sk);
1301
1302 switch (optname) {
1303 case SO_DEBUG:
1304 if (val && !sockopt_capable(CAP_NET_ADMIN))
1305 ret = -EACCES;
1306 else
1307 sock_valbool_flag(sk, SOCK_DBG, valbool);
1308 break;
1309 case SO_REUSEADDR:
1310 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1311 break;
1312 case SO_REUSEPORT:
1313 if (valbool && !sk_is_inet(sk))
1314 ret = -EOPNOTSUPP;
1315 else
1316 sk->sk_reuseport = valbool;
1317 break;
1318 case SO_DONTROUTE:
1319 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1320 sk_dst_reset(sk);
1321 break;
1322 case SO_BROADCAST:
1323 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1324 break;
1325 case SO_SNDBUF:
1326 /* Don't error on this BSD doesn't and if you think
1327 * about it this is right. Otherwise apps have to
1328 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1329 * are treated in BSD as hints
1330 */
1331 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
1332 set_sndbuf:
1333 /* Ensure val * 2 fits into an int, to prevent max_t()
1334 * from treating it as a negative value.
1335 */
1336 val = min_t(int, val, INT_MAX / 2);
1337 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1338 WRITE_ONCE(sk->sk_sndbuf,
1339 max_t(int, val * 2, SOCK_MIN_SNDBUF));
1340 /* Wake up sending tasks if we upped the value. */
1341 sk->sk_write_space(sk);
1342 break;
1343
1344 case SO_SNDBUFFORCE:
1345 if (!sockopt_capable(CAP_NET_ADMIN)) {
1346 ret = -EPERM;
1347 break;
1348 }
1349
1350 /* No negative values (to prevent underflow, as val will be
1351 * multiplied by 2).
1352 */
1353 if (val < 0)
1354 val = 0;
1355 goto set_sndbuf;
1356
1357 case SO_RCVBUF:
1358 /* Don't error on this - BSD doesn't, and if you think
1359 * about it, this is right. Otherwise apps have to
1360 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1361 * are treated in BSD as hints.
1362 */
1363 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
1364 break;
1365
1366 case SO_RCVBUFFORCE:
1367 if (!sockopt_capable(CAP_NET_ADMIN)) {
1368 ret = -EPERM;
1369 break;
1370 }
1371
1372 /* No negative values (to prevent underflow, as val will be
1373 * multiplied by 2).
1374 */
1375 __sock_set_rcvbuf(sk, max(val, 0));
1376 break;
1377
1378 case SO_KEEPALIVE:
1379 if (sk->sk_prot->keepalive)
1380 sk->sk_prot->keepalive(sk, valbool);
1381 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1382 break;
1383
1384 case SO_OOBINLINE:
1385 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1386 break;
1387
1388 case SO_NO_CHECK:
1389 sk->sk_no_check_tx = valbool;
1390 break;
1391
1392 case SO_LINGER:
1393 if (optlen < sizeof(ling)) {
1394 ret = -EINVAL; /* 1003.1g */
1395 break;
1396 }
1397 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1398 ret = -EFAULT;
1399 break;
1400 }
1401 if (!ling.l_onoff) {
1402 sock_reset_flag(sk, SOCK_LINGER);
1403 } else {
1404 unsigned long t_sec = ling.l_linger;
1405
1406 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
1407 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
1408 else
1409 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
1410 sock_set_flag(sk, SOCK_LINGER);
1411 }
1412 break;
1413
1414 case SO_BSDCOMPAT:
1415 break;
1416
1417 case SO_TIMESTAMP_OLD:
1418 case SO_TIMESTAMP_NEW:
1419 case SO_TIMESTAMPNS_OLD:
1420 case SO_TIMESTAMPNS_NEW:
1421 sock_set_timestamp(sk, optname, valbool);
1422 break;
1423
1424 case SO_TIMESTAMPING_NEW:
1425 case SO_TIMESTAMPING_OLD:
1426 if (optlen == sizeof(timestamping)) {
1427 if (copy_from_sockptr(&timestamping, optval,
1428 sizeof(timestamping))) {
1429 ret = -EFAULT;
1430 break;
1431 }
1432 } else {
1433 memset(&timestamping, 0, sizeof(timestamping));
1434 timestamping.flags = val;
1435 }
1436 ret = sock_set_timestamping(sk, optname, timestamping);
1437 break;
1438
1439 case SO_RCVLOWAT:
1440 {
1441 int (*set_rcvlowat)(struct sock *sk, int val) = NULL;
1442
1443 if (val < 0)
1444 val = INT_MAX;
1445 if (sock)
1446 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
1447 if (set_rcvlowat)
1448 ret = set_rcvlowat(sk, val);
1449 else
1450 WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1451 break;
1452 }
1453 case SO_RCVTIMEO_OLD:
1454 case SO_RCVTIMEO_NEW:
1455 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1456 optlen, optname == SO_RCVTIMEO_OLD);
1457 break;
1458
1459 case SO_SNDTIMEO_OLD:
1460 case SO_SNDTIMEO_NEW:
1461 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1462 optlen, optname == SO_SNDTIMEO_OLD);
1463 break;
1464
1465 case SO_ATTACH_FILTER: {
1466 struct sock_fprog fprog;
1467
1468 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1469 if (!ret)
1470 ret = sk_attach_filter(&fprog, sk);
1471 break;
1472 }
1473 case SO_ATTACH_BPF:
1474 ret = -EINVAL;
1475 if (optlen == sizeof(u32)) {
1476 u32 ufd;
1477
1478 ret = -EFAULT;
1479 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1480 break;
1481
1482 ret = sk_attach_bpf(ufd, sk);
1483 }
1484 break;
1485
1486 case SO_ATTACH_REUSEPORT_CBPF: {
1487 struct sock_fprog fprog;
1488
1489 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1490 if (!ret)
1491 ret = sk_reuseport_attach_filter(&fprog, sk);
1492 break;
1493 }
1494 case SO_ATTACH_REUSEPORT_EBPF:
1495 ret = -EINVAL;
1496 if (optlen == sizeof(u32)) {
1497 u32 ufd;
1498
1499 ret = -EFAULT;
1500 if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1501 break;
1502
1503 ret = sk_reuseport_attach_bpf(ufd, sk);
1504 }
1505 break;
1506
1507 case SO_DETACH_REUSEPORT_BPF:
1508 ret = reuseport_detach_prog(sk);
1509 break;
1510
1511 case SO_DETACH_FILTER:
1512 ret = sk_detach_filter(sk);
1513 break;
1514
1515 case SO_LOCK_FILTER:
1516 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1517 ret = -EPERM;
1518 else
1519 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1520 break;
1521
1522 case SO_MARK:
1523 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1524 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1525 ret = -EPERM;
1526 break;
1527 }
1528
1529 __sock_set_mark(sk, val);
1530 break;
1531 case SO_RCVMARK:
1532 sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
1533 break;
1534
1535 case SO_RCVPRIORITY:
1536 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool);
1537 break;
1538
1539 case SO_RXQ_OVFL:
1540 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1541 break;
1542
1543 case SO_WIFI_STATUS:
1544 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1545 break;
1546
1547 case SO_NOFCS:
1548 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1549 break;
1550
1551 case SO_SELECT_ERR_QUEUE:
1552 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1553 break;
1554
1555 case SO_PASSCRED:
1556 if (sk_may_scm_recv(sk))
1557 sk->sk_scm_credentials = valbool;
1558 else
1559 ret = -EOPNOTSUPP;
1560 break;
1561
1562 case SO_PASSSEC:
1563 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk))
1564 sk->sk_scm_security = valbool;
1565 else
1566 ret = -EOPNOTSUPP;
1567 break;
1568
1569 case SO_PASSPIDFD:
1570 if (sk_is_unix(sk))
1571 sk->sk_scm_pidfd = valbool;
1572 else
1573 ret = -EOPNOTSUPP;
1574 break;
1575
1576 case SO_PASSRIGHTS:
1577 if (sk_is_unix(sk))
1578 sk->sk_scm_rights = valbool;
1579 else
1580 ret = -EOPNOTSUPP;
1581 break;
1582
1583 case SO_INCOMING_CPU:
1584 reuseport_update_incoming_cpu(sk, val);
1585 break;
1586
1587 case SO_CNX_ADVICE:
1588 if (val == 1)
1589 dst_negative_advice(sk);
1590 break;
1591
1592 case SO_ZEROCOPY:
1593 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1594 if (!(sk_is_tcp(sk) ||
1595 (sk->sk_type == SOCK_DGRAM &&
1596 sk->sk_protocol == IPPROTO_UDP)))
1597 ret = -EOPNOTSUPP;
1598 } else if (sk->sk_family != PF_RDS) {
1599 ret = -EOPNOTSUPP;
1600 }
1601 if (!ret) {
1602 if (val < 0 || val > 1)
1603 ret = -EINVAL;
1604 else
1605 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1606 }
1607 break;
1608
1609 case SO_TXTIME:
1610 if (optlen != sizeof(struct sock_txtime)) {
1611 ret = -EINVAL;
1612 break;
1613 } else if (copy_from_sockptr(&sk_txtime, optval,
1614 sizeof(struct sock_txtime))) {
1615 ret = -EFAULT;
1616 break;
1617 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1618 ret = -EINVAL;
1619 break;
1620 }
1621 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1622 * scheduler has enough safeguards.
1623 */
1624 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1625 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1626 ret = -EPERM;
1627 break;
1628 }
1629
1630 ret = sockopt_validate_clockid(sk_txtime.clockid);
1631 if (ret)
1632 break;
1633
1634 sock_valbool_flag(sk, SOCK_TXTIME, true);
1635 sk->sk_clockid = sk_txtime.clockid;
1636 sk->sk_txtime_deadline_mode =
1637 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1638 sk->sk_txtime_report_errors =
1639 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1640 break;
1641
1642 case SO_BINDTOIFINDEX:
1643 ret = sock_bindtoindex_locked(sk, val);
1644 break;
1645
1646 case SO_BUF_LOCK:
1647 if (val & ~SOCK_BUF_LOCK_MASK) {
1648 ret = -EINVAL;
1649 break;
1650 }
1651 sk->sk_userlocks = val | (sk->sk_userlocks &
1652 ~SOCK_BUF_LOCK_MASK);
1653 break;
1654
1655 case SO_RESERVE_MEM:
1656 {
1657 int delta;
1658
1659 if (val < 0) {
1660 ret = -EINVAL;
1661 break;
1662 }
1663
1664 delta = val - sk->sk_reserved_mem;
1665 if (delta < 0)
1666 sock_release_reserved_memory(sk, -delta);
1667 else
1668 ret = sock_reserve_memory(sk, delta);
1669 break;
1670 }
1671
1672 default:
1673 ret = -ENOPROTOOPT;
1674 break;
1675 }
1676 sockopt_release_sock(sk);
1677 return ret;
1678 }
1679
1680 int sock_setsockopt(struct socket *sock, int level, int optname,
1681 sockptr_t optval, unsigned int optlen)
1682 {
1683 return sk_setsockopt(sock->sk, level, optname,
1684 optval, optlen);
1685 }
1686 EXPORT_SYMBOL(sock_setsockopt);
1687
1688 static const struct cred *sk_get_peer_cred(struct sock *sk)
1689 {
1690 const struct cred *cred;
1691
1692 spin_lock(&sk->sk_peer_lock);
1693 cred = get_cred(sk->sk_peer_cred);
1694 spin_unlock(&sk->sk_peer_lock);
1695
1696 return cred;
1697 }
1698
1699 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1700 struct ucred *ucred)
1701 {
1702 ucred->pid = pid_vnr(pid);
1703 ucred->uid = ucred->gid = -1;
1704 if (cred) {
1705 struct user_namespace *current_ns = current_user_ns();
1706
1707 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1708 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1709 }
1710 }
1711
1712 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1713 {
1714 struct user_namespace *user_ns = current_user_ns();
1715 int i;
1716
1717 for (i = 0; i < src->ngroups; i++) {
1718 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1719
1720 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1721 return -EFAULT;
1722 }
1723
1724 return 0;
1725 }
1726
1727 int sk_getsockopt(struct sock *sk, int level, int optname,
1728 sockptr_t optval, sockptr_t optlen)
1729 {
1730 struct socket *sock = sk->sk_socket;
1731
1732 union {
1733 int val;
1734 u64 val64;
1735 unsigned long ulval;
1736 struct linger ling;
1737 struct old_timeval32 tm32;
1738 struct __kernel_old_timeval tm;
1739 struct __kernel_sock_timeval stm;
1740 struct sock_txtime txtime;
1741 struct so_timestamping timestamping;
1742 } v;
1743
1744 int lv = sizeof(int);
1745 int len;
1746
1747 if (copy_from_sockptr(&len, optlen, sizeof(int)))
1748 return -EFAULT;
1749 if (len < 0)
1750 return -EINVAL;
1751
1752 memset(&v, 0, sizeof(v));
1753
1754 switch (optname) {
1755 case SO_DEBUG:
1756 v.val = sock_flag(sk, SOCK_DBG);
1757 break;
1758
1759 case SO_DONTROUTE:
1760 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1761 break;
1762
1763 case SO_BROADCAST:
1764 v.val = sock_flag(sk, SOCK_BROADCAST);
1765 break;
1766
1767 case SO_SNDBUF:
1768 v.val = READ_ONCE(sk->sk_sndbuf);
1769 break;
1770
1771 case SO_RCVBUF:
1772 v.val = READ_ONCE(sk->sk_rcvbuf);
1773 break;
1774
1775 case SO_REUSEADDR:
1776 v.val = sk->sk_reuse;
1777 break;
1778
1779 case SO_REUSEPORT:
1780 v.val = sk->sk_reuseport;
1781 break;
1782
1783 case SO_KEEPALIVE:
1784 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1785 break;
1786
1787 case SO_TYPE:
1788 v.val = sk->sk_type;
1789 break;
1790
1791 case SO_PROTOCOL:
1792 v.val = sk->sk_protocol;
1793 break;
1794
1795 case SO_DOMAIN:
1796 v.val = sk->sk_family;
1797 break;
1798
1799 case SO_ERROR:
1800 v.val = -sock_error(sk);
1801 if (v.val == 0)
1802 v.val = xchg(&sk->sk_err_soft, 0);
1803 break;
1804
1805 case SO_OOBINLINE:
1806 v.val = sock_flag(sk, SOCK_URGINLINE);
1807 break;
1808
1809 case SO_NO_CHECK:
1810 v.val = sk->sk_no_check_tx;
1811 break;
1812
1813 case SO_PRIORITY:
1814 v.val = READ_ONCE(sk->sk_priority);
1815 break;
1816
1817 case SO_LINGER:
1818 lv = sizeof(v.ling);
1819 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
1820 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1821 break;
1822
1823 case SO_BSDCOMPAT:
1824 break;
1825
1826 case SO_TIMESTAMP_OLD:
1827 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1828 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1829 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1830 break;
1831
1832 case SO_TIMESTAMPNS_OLD:
1833 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1834 break;
1835
1836 case SO_TIMESTAMP_NEW:
1837 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1838 break;
1839
1840 case SO_TIMESTAMPNS_NEW:
1841 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1842 break;
1843
1844 case SO_TIMESTAMPING_OLD:
1845 case SO_TIMESTAMPING_NEW:
1846 lv = sizeof(v.timestamping);
1847 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1848 * returning the flags when they were set through the same option.
1849 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1850 */
1851 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1852 v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1853 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1854 }
1855 break;
1856
1857 case SO_RCVTIMEO_OLD:
1858 case SO_RCVTIMEO_NEW:
1859 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1860 SO_RCVTIMEO_OLD == optname);
1861 break;
1862
1863 case SO_SNDTIMEO_OLD:
1864 case SO_SNDTIMEO_NEW:
1865 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1866 SO_SNDTIMEO_OLD == optname);
1867 break;
1868
1869 case SO_RCVLOWAT:
1870 v.val = READ_ONCE(sk->sk_rcvlowat);
1871 break;
1872
1873 case SO_SNDLOWAT:
1874 v.val = 1;
1875 break;
1876
1877 case SO_PASSCRED:
1878 if (!sk_may_scm_recv(sk))
1879 return -EOPNOTSUPP;
1880
1881 v.val = sk->sk_scm_credentials;
1882 break;
1883
1884 case SO_PASSPIDFD:
1885 if (!sk_is_unix(sk))
1886 return -EOPNOTSUPP;
1887
1888 v.val = sk->sk_scm_pidfd;
1889 break;
1890
1891 case SO_PASSRIGHTS:
1892 if (!sk_is_unix(sk))
1893 return -EOPNOTSUPP;
1894
1895 v.val = sk->sk_scm_rights;
1896 break;
1897
1898 case SO_PEERCRED:
1899 {
1900 struct ucred peercred;
1901 if (len > sizeof(peercred))
1902 len = sizeof(peercred);
1903
1904 spin_lock(&sk->sk_peer_lock);
1905 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1906 spin_unlock(&sk->sk_peer_lock);
1907
1908 if (copy_to_sockptr(optval, &peercred, len))
1909 return -EFAULT;
1910 goto lenout;
1911 }
1912
1913 case SO_PEERPIDFD:
1914 {
1915 struct pid *peer_pid;
1916 struct file *pidfd_file = NULL;
1917 unsigned int flags = 0;
1918 int pidfd;
1919
1920 if (len > sizeof(pidfd))
1921 len = sizeof(pidfd);
1922
1923 spin_lock(&sk->sk_peer_lock);
1924 peer_pid = get_pid(sk->sk_peer_pid);
1925 spin_unlock(&sk->sk_peer_lock);
1926
1927 if (!peer_pid)
1928 return -ENODATA;
1929
1930 /* The use of PIDFD_STALE requires stashing of struct pid
1931 * on pidfs with pidfs_register_pid() and only AF_UNIX sockets
1932 * were prepared for this.
1933 */
1934 if (sk->sk_family == AF_UNIX)
1935 flags = PIDFD_STALE;
1936
1937 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file);
1938 put_pid(peer_pid);
1939 if (pidfd < 0)
1940 return pidfd;
1941
1942 if (copy_to_sockptr(optval, &pidfd, len) ||
1943 copy_to_sockptr(optlen, &len, sizeof(int))) {
1944 put_unused_fd(pidfd);
1945 fput(pidfd_file);
1946
1947 return -EFAULT;
1948 }
1949
1950 fd_install(pidfd, pidfd_file);
1951 return 0;
1952 }
1953
1954 case SO_PEERGROUPS:
1955 {
1956 const struct cred *cred;
1957 int ret, n;
1958
1959 cred = sk_get_peer_cred(sk);
1960 if (!cred)
1961 return -ENODATA;
1962
1963 n = cred->group_info->ngroups;
1964 if (len < n * sizeof(gid_t)) {
1965 len = n * sizeof(gid_t);
1966 put_cred(cred);
1967 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1968 }
1969 len = n * sizeof(gid_t);
1970
1971 ret = groups_to_user(optval, cred->group_info);
1972 put_cred(cred);
1973 if (ret)
1974 return ret;
1975 goto lenout;
1976 }
1977
1978 case SO_PEERNAME:
1979 {
1980 struct sockaddr_storage address;
1981
1982 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1983 if (lv < 0)
1984 return -ENOTCONN;
1985 if (lv < len)
1986 return -EINVAL;
1987 if (copy_to_sockptr(optval, &address, len))
1988 return -EFAULT;
1989 goto lenout;
1990 }
1991
1992 /* Dubious BSD thing... Probably nobody even uses it, but
1993 * the UNIX standard wants it for whatever reason... -DaveM
1994 */
1995 case SO_ACCEPTCONN:
1996 v.val = sk->sk_state == TCP_LISTEN;
1997 break;
1998
1999 case SO_PASSSEC:
2000 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk))
2001 return -EOPNOTSUPP;
2002
2003 v.val = sk->sk_scm_security;
2004 break;
2005
2006 case SO_PEERSEC:
2007 return security_socket_getpeersec_stream(sock,
2008 optval, optlen, len);
2009
2010 case SO_MARK:
2011 v.val = READ_ONCE(sk->sk_mark);
2012 break;
2013
2014 case SO_RCVMARK:
2015 v.val = sock_flag(sk, SOCK_RCVMARK);
2016 break;
2017
2018 case SO_RCVPRIORITY:
2019 v.val = sock_flag(sk, SOCK_RCVPRIORITY);
2020 break;
2021
2022 case SO_RXQ_OVFL:
2023 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
2024 break;
2025
2026 case SO_WIFI_STATUS:
2027 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
2028 break;
2029
2030 case SO_PEEK_OFF:
2031 if (!READ_ONCE(sock->ops)->set_peek_off)
2032 return -EOPNOTSUPP;
2033
2034 v.val = READ_ONCE(sk->sk_peek_off);
2035 break;
2036 case SO_NOFCS:
2037 v.val = sock_flag(sk, SOCK_NOFCS);
2038 break;
2039
2040 case SO_BINDTODEVICE:
2041 return sock_getbindtodevice(sk, optval, optlen, len);
2042
2043 case SO_GET_FILTER:
2044 len = sk_get_filter(sk, optval, len);
2045 if (len < 0)
2046 return len;
2047
2048 goto lenout;
2049
2050 case SO_LOCK_FILTER:
2051 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
2052 break;
2053
2054 case SO_BPF_EXTENSIONS:
2055 v.val = bpf_tell_extensions();
2056 break;
2057
2058 case SO_SELECT_ERR_QUEUE:
2059 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
2060 break;
2061
2062 #ifdef CONFIG_NET_RX_BUSY_POLL
2063 case SO_BUSY_POLL:
2064 v.val = READ_ONCE(sk->sk_ll_usec);
2065 break;
2066 case SO_PREFER_BUSY_POLL:
2067 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
2068 break;
2069 #endif
2070
2071 case SO_MAX_PACING_RATE:
2072 /* The READ_ONCE() pairs with the WRITE_ONCE() in sk_setsockopt() */
2073 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
2074 lv = sizeof(v.ulval);
2075 v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
2076 } else {
2077 /* 32bit version */
2078 v.val = min_t(unsigned long, ~0U,
2079 READ_ONCE(sk->sk_max_pacing_rate));
2080 }
2081 break;
2082
2083 case SO_INCOMING_CPU:
2084 v.val = READ_ONCE(sk->sk_incoming_cpu);
2085 break;
2086
2087 case SO_MEMINFO:
2088 {
2089 u32 meminfo[SK_MEMINFO_VARS];
2090
2091 sk_get_meminfo(sk, meminfo);
2092
2093 len = min_t(unsigned int, len, sizeof(meminfo));
2094 if (copy_to_sockptr(optval, &meminfo, len))
2095 return -EFAULT;
2096
2097 goto lenout;
2098 }
2099
2100 #ifdef CONFIG_NET_RX_BUSY_POLL
2101 case SO_INCOMING_NAPI_ID:
2102 v.val = READ_ONCE(sk->sk_napi_id);
2103
2104 /* aggregate non-NAPI IDs down to 0 */
2105 if (!napi_id_valid(v.val))
2106 v.val = 0;
2107
2108 break;
2109 #endif
2110
2111 case SO_COOKIE:
2112 lv = sizeof(u64);
2113 if (len < lv)
2114 return -EINVAL;
2115 v.val64 = sock_gen_cookie(sk);
2116 break;
2117
2118 case SO_ZEROCOPY:
2119 v.val = sock_flag(sk, SOCK_ZEROCOPY);
2120 break;
2121
2122 case SO_TXTIME:
2123 lv = sizeof(v.txtime);
2124 v.txtime.clockid = sk->sk_clockid;
2125 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
2126 SOF_TXTIME_DEADLINE_MODE : 0;
2127 v.txtime.flags |= sk->sk_txtime_report_errors ?
2128 SOF_TXTIME_REPORT_ERRORS : 0;
2129 break;
2130
2131 case SO_BINDTOIFINDEX:
2132 v.val = READ_ONCE(sk->sk_bound_dev_if);
2133 break;
2134
2135 case SO_NETNS_COOKIE:
2136 lv = sizeof(u64);
2137 if (len != lv)
2138 return -EINVAL;
2139 v.val64 = sock_net(sk)->net_cookie;
2140 break;
2141
2142 case SO_BUF_LOCK:
2143 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
2144 break;
2145
2146 case SO_RESERVE_MEM:
2147 v.val = READ_ONCE(sk->sk_reserved_mem);
2148 break;
2149
2150 case SO_TXREHASH:
2151 if (!sk_is_tcp(sk))
2152 return -EOPNOTSUPP;
2153
2154 /* Paired with WRITE_ONCE() in sk_setsockopt() */
2155 v.val = READ_ONCE(sk->sk_txrehash);
2156 break;
2157
2158 default:
2159 /* We implement the SO_SNDLOWAT etc to not be settable
2160 * (1003.1g 7).
2161 */
2162 return -ENOPROTOOPT;
2163 }
2164
2165 if (len > lv)
2166 len = lv;
2167 if (copy_to_sockptr(optval, &v, len))
2168 return -EFAULT;
2169 lenout:
2170 if (copy_to_sockptr(optlen, &len, sizeof(int)))
2171 return -EFAULT;
2172 return 0;
2173 }
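
/* Illustrative user-space counterpart of the SO_PEERCRED case handled in
 * sk_getsockopt() above; a minimal sketch only, assuming "fd" is a
 * connected AF_UNIX stream socket:
 *
 *	#define _GNU_SOURCE
 *	#include <sys/socket.h>
 *	#include <stdio.h>
 *
 *	struct ucred uc;
 *	socklen_t len = sizeof(uc);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &uc, &len) == 0)
 *		printf("peer pid=%d uid=%u gid=%u\n",
 *		       (int)uc.pid, (unsigned)uc.uid, (unsigned)uc.gid);
 *
 * The kernel truncates the copy to min(len, sizeof(struct ucred)) and
 * writes the resulting length back through optlen at lenout.
 */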
2174
2175 /*
2176 * Initialize an sk_lock.
2177 *
2178 * (We also register the sk_lock with the lock validator.)
2179 */
2180 static inline void sock_lock_init(struct sock *sk)
2181 {
2182 sk_owner_clear(sk);
2183
2184 if (sk->sk_kern_sock)
2185 sock_lock_init_class_and_name(
2186 sk,
2187 af_family_kern_slock_key_strings[sk->sk_family],
2188 af_family_kern_slock_keys + sk->sk_family,
2189 af_family_kern_key_strings[sk->sk_family],
2190 af_family_kern_keys + sk->sk_family);
2191 else
2192 sock_lock_init_class_and_name(
2193 sk,
2194 af_family_slock_key_strings[sk->sk_family],
2195 af_family_slock_keys + sk->sk_family,
2196 af_family_key_strings[sk->sk_family],
2197 af_family_keys + sk->sk_family);
2198 }
2199
2200 /*
2201 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2202 * even temporarily, because of RCU lookups. sk_node should also be left as is.
2203 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2204 */
2205 static void sock_copy(struct sock *nsk, const struct sock *osk)
2206 {
2207 const struct proto *prot = READ_ONCE(osk->sk_prot);
2208 #ifdef CONFIG_SECURITY_NETWORK
2209 void *sptr = nsk->sk_security;
2210 #endif
2211
2212 /* If we move sk_tx_queue_mapping out of the private section,
2213 * we must check if sk_tx_queue_clear() is called after
2214 * sock_copy() in sk_clone_lock().
2215 */
2216 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2217 offsetof(struct sock, sk_dontcopy_begin) ||
2218 offsetof(struct sock, sk_tx_queue_mapping) >=
2219 offsetof(struct sock, sk_dontcopy_end));
2220
2221 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2222
2223 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2224 prot->obj_size - offsetof(struct sock, sk_dontcopy_end),
2225 /* alloc is larger than struct, see sk_prot_alloc() */);
2226
2227 #ifdef CONFIG_SECURITY_NETWORK
2228 nsk->sk_security = sptr;
2229 security_sk_clone(osk, nsk);
2230 #endif
2231 }
2232
2233 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2234 int family)
2235 {
2236 struct sock *sk;
2237 struct kmem_cache *slab;
2238
2239 slab = prot->slab;
2240 if (slab != NULL) {
2241 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2242 if (!sk)
2243 return sk;
2244 if (want_init_on_alloc(priority))
2245 sk_prot_clear_nulls(sk, prot->obj_size);
2246 } else
2247 sk = kmalloc(prot->obj_size, priority);
2248
2249 if (sk != NULL) {
2250 if (security_sk_alloc(sk, family, priority))
2251 goto out_free;
2252
2253 if (!try_module_get(prot->owner))
2254 goto out_free_sec;
2255 }
2256
2257 return sk;
2258
2259 out_free_sec:
2260 security_sk_free(sk);
2261 out_free:
2262 if (slab != NULL)
2263 kmem_cache_free(slab, sk);
2264 else
2265 kfree(sk);
2266 return NULL;
2267 }
2268
2269 static void sk_prot_free(struct proto *prot, struct sock *sk)
2270 {
2271 struct kmem_cache *slab;
2272 struct module *owner;
2273
2274 owner = prot->owner;
2275 slab = prot->slab;
2276
2277 cgroup_sk_free(&sk->sk_cgrp_data);
2278 mem_cgroup_sk_free(sk);
2279 security_sk_free(sk);
2280
2281 sk_owner_put(sk);
2282
2283 if (slab != NULL)
2284 kmem_cache_free(slab, sk);
2285 else
2286 kfree(sk);
2287 module_put(owner);
2288 }
2289
2290 /**
2291 * sk_alloc - All socket objects are allocated here
2292 * @net: the applicable net namespace
2293 * @family: protocol family
2294 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2295 * @prot: struct proto associated with this new sock instance
2296 * @kern: is this to be a kernel socket?
2297 */
2298 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2299 struct proto *prot, int kern)
2300 {
2301 struct sock *sk;
2302
2303 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2304 if (sk) {
2305 sk->sk_family = family;
2306 /*
2307 * See comment in struct sock definition to understand
2308 * why we need sk_prot_creator -acme
2309 */
2310 sk->sk_prot = sk->sk_prot_creator = prot;
2311 sk->sk_kern_sock = kern;
2312 sock_lock_init(sk);
2313 sk->sk_net_refcnt = kern ? 0 : 1;
2314 if (likely(sk->sk_net_refcnt)) {
2315 get_net_track(net, &sk->ns_tracker, priority);
2316 sock_inuse_add(net, 1);
2317 } else {
2318 net_passive_inc(net);
2319 __netns_tracker_alloc(net, &sk->ns_tracker,
2320 false, priority);
2321 }
2322
2323 sock_net_set(sk, net);
2324 refcount_set(&sk->sk_wmem_alloc, 1);
2325
2326 mem_cgroup_sk_alloc(sk);
2327 cgroup_sk_alloc(&sk->sk_cgrp_data);
2328 sock_update_classid(&sk->sk_cgrp_data);
2329 sock_update_netprioidx(&sk->sk_cgrp_data);
2330 sk_tx_queue_clear(sk);
2331 }
2332
2333 return sk;
2334 }
2335 EXPORT_SYMBOL(sk_alloc);
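
/* Minimal sketch of how a protocol family's create() handler typically
 * pairs sk_alloc() with sock_init_data(); "my_proto" and the locals
 * (sock, net, protocol, kern) are hypothetical stand-ins for the
 * handler's own context, not definitions from this file:
 *
 *	struct sock *sk;
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto, kern);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *	sk->sk_protocol = protocol;
 *
 * On a later failure, sk_free() undoes the allocation once
 * sk_wmem_alloc drops back to zero.
 */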
2336
2337 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2338 * grace period. This is the case for UDP sockets and TCP listeners.
2339 */
2340 static void __sk_destruct(struct rcu_head *head)
2341 {
2342 struct sock *sk = container_of(head, struct sock, sk_rcu);
2343 struct net *net = sock_net(sk);
2344 struct sk_filter *filter;
2345
2346 if (sk->sk_destruct)
2347 sk->sk_destruct(sk);
2348
2349 filter = rcu_dereference_check(sk->sk_filter,
2350 refcount_read(&sk->sk_wmem_alloc) == 0);
2351 if (filter) {
2352 sk_filter_uncharge(sk, filter);
2353 RCU_INIT_POINTER(sk->sk_filter, NULL);
2354 }
2355
2356 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2357
2358 #ifdef CONFIG_BPF_SYSCALL
2359 bpf_sk_storage_free(sk);
2360 #endif
2361
2362 if (atomic_read(&sk->sk_omem_alloc))
2363 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2364 __func__, atomic_read(&sk->sk_omem_alloc));
2365
2366 if (sk->sk_frag.page) {
2367 put_page(sk->sk_frag.page);
2368 sk->sk_frag.page = NULL;
2369 }
2370
2371 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2372 put_cred(sk->sk_peer_cred);
2373 put_pid(sk->sk_peer_pid);
2374
2375 if (likely(sk->sk_net_refcnt)) {
2376 put_net_track(net, &sk->ns_tracker);
2377 } else {
2378 __netns_tracker_free(net, &sk->ns_tracker, false);
2379 net_passive_dec(net);
2380 }
2381 sk_prot_free(sk->sk_prot_creator, sk);
2382 }
2383
2384 void sk_net_refcnt_upgrade(struct sock *sk)
2385 {
2386 struct net *net = sock_net(sk);
2387
2388 WARN_ON_ONCE(sk->sk_net_refcnt);
2389 __netns_tracker_free(net, &sk->ns_tracker, false);
2390 net_passive_dec(net);
2391 sk->sk_net_refcnt = 1;
2392 get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
2393 sock_inuse_add(net, 1);
2394 }
2395 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade);
2396
2397 void sk_destruct(struct sock *sk)
2398 {
2399 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2400
2401 if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2402 reuseport_detach_sock(sk);
2403 use_call_rcu = true;
2404 }
2405
2406 if (use_call_rcu)
2407 call_rcu(&sk->sk_rcu, __sk_destruct);
2408 else
2409 __sk_destruct(&sk->sk_rcu);
2410 }
2411
2412 static void __sk_free(struct sock *sk)
2413 {
2414 if (likely(sk->sk_net_refcnt))
2415 sock_inuse_add(sock_net(sk), -1);
2416
2417 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2418 sock_diag_broadcast_destroy(sk);
2419 else
2420 sk_destruct(sk);
2421 }
2422
2423 void sk_free(struct sock *sk)
2424 {
2425 /*
2426 * We subtract one from sk_wmem_alloc and can know if
2427 * some packets are still in some tx queue.
2428 * If not null, sock_wfree() will call __sk_free(sk) later
2429 */
2430 if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2431 __sk_free(sk);
2432 }
2433 EXPORT_SYMBOL(sk_free);
2434
2435 static void sk_init_common(struct sock *sk)
2436 {
2437 skb_queue_head_init(&sk->sk_receive_queue);
2438 skb_queue_head_init(&sk->sk_write_queue);
2439 skb_queue_head_init(&sk->sk_error_queue);
2440
2441 rwlock_init(&sk->sk_callback_lock);
2442 lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2443 af_rlock_keys + sk->sk_family,
2444 af_family_rlock_key_strings[sk->sk_family]);
2445 lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2446 af_wlock_keys + sk->sk_family,
2447 af_family_wlock_key_strings[sk->sk_family]);
2448 lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2449 af_elock_keys + sk->sk_family,
2450 af_family_elock_key_strings[sk->sk_family]);
2451 if (sk->sk_kern_sock)
2452 lockdep_set_class_and_name(&sk->sk_callback_lock,
2453 af_kern_callback_keys + sk->sk_family,
2454 af_family_kern_clock_key_strings[sk->sk_family]);
2455 else
2456 lockdep_set_class_and_name(&sk->sk_callback_lock,
2457 af_callback_keys + sk->sk_family,
2458 af_family_clock_key_strings[sk->sk_family]);
2459 }
2460
2461 /**
2462 * sk_clone_lock - clone a socket, and lock its clone
2463 * @sk: the socket to clone
2464 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2465 *
2466 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2467 */
2468 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2469 {
2470 struct proto *prot = READ_ONCE(sk->sk_prot);
2471 struct sk_filter *filter;
2472 bool is_charged = true;
2473 struct sock *newsk;
2474
2475 newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2476 if (!newsk)
2477 goto out;
2478
2479 sock_copy(newsk, sk);
2480
2481 newsk->sk_prot_creator = prot;
2482
2483 /* SANITY */
2484 if (likely(newsk->sk_net_refcnt)) {
2485 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2486 sock_inuse_add(sock_net(newsk), 1);
2487 } else {
2488 /* Kernel sockets are not elevating the struct net refcount.
2489 * Instead, use a tracker to more easily detect if a layer
2490 * is not properly dismantling its kernel sockets at netns
2491 * destroy time.
2492 */
2493 net_passive_inc(sock_net(newsk));
2494 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2495 false, priority);
2496 }
2497 sk_node_init(&newsk->sk_node);
2498 sock_lock_init(newsk);
2499 bh_lock_sock(newsk);
2500 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
2501 newsk->sk_backlog.len = 0;
2502
2503 atomic_set(&newsk->sk_rmem_alloc, 0);
2504
2505 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2506 refcount_set(&newsk->sk_wmem_alloc, 1);
2507
2508 atomic_set(&newsk->sk_omem_alloc, 0);
2509 sk_init_common(newsk);
2510
2511 newsk->sk_dst_cache = NULL;
2512 newsk->sk_dst_pending_confirm = 0;
2513 newsk->sk_wmem_queued = 0;
2514 newsk->sk_forward_alloc = 0;
2515 newsk->sk_reserved_mem = 0;
2516 atomic_set(&newsk->sk_drops, 0);
2517 newsk->sk_send_head = NULL;
2518 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2519 atomic_set(&newsk->sk_zckey, 0);
2520
2521 sock_reset_flag(newsk, SOCK_DONE);
2522
2523 /* sk->sk_memcg will be populated at accept() time */
2524 newsk->sk_memcg = NULL;
2525
2526 cgroup_sk_clone(&newsk->sk_cgrp_data);
2527
2528 rcu_read_lock();
2529 filter = rcu_dereference(sk->sk_filter);
2530 if (filter != NULL)
2531 /* though it's an empty new sock, the charging may fail
2532 * if sysctl_optmem_max was changed between creation of
2533 * original socket and cloning
2534 */
2535 is_charged = sk_filter_charge(newsk, filter);
2536 RCU_INIT_POINTER(newsk->sk_filter, filter);
2537 rcu_read_unlock();
2538
2539 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2540 /* We need to make sure that we don't uncharge the new
2541 * socket if we couldn't charge it in the first place
2542 * as otherwise we uncharge the parent's filter.
2543 */
2544 if (!is_charged)
2545 RCU_INIT_POINTER(newsk->sk_filter, NULL);
2546
2547 goto free;
2548 }
2549
2550 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2551
2552 if (bpf_sk_storage_clone(sk, newsk))
2553 goto free;
2554
2555 /* Clear sk_user_data if parent had the pointer tagged
2556 * as not suitable for copying when cloning.
2557 */
2558 if (sk_user_data_is_nocopy(newsk))
2559 newsk->sk_user_data = NULL;
2560
2561 newsk->sk_err = 0;
2562 newsk->sk_err_soft = 0;
2563 newsk->sk_priority = 0;
2564 newsk->sk_incoming_cpu = raw_smp_processor_id();
2565
2566 /* Before updating sk_refcnt, we must commit prior changes to memory
2567 * (Documentation/RCU/rculist_nulls.rst for details)
2568 */
2569 smp_wmb();
2570 refcount_set(&newsk->sk_refcnt, 2);
2571
2572 sk_set_socket(newsk, NULL);
2573 sk_tx_queue_clear(newsk);
2574 RCU_INIT_POINTER(newsk->sk_wq, NULL);
2575
2576 if (newsk->sk_prot->sockets_allocated)
2577 sk_sockets_allocated_inc(newsk);
2578
2579 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2580 net_enable_timestamp();
2581 out:
2582 return newsk;
2583 free:
2584 /* It is still raw copy of parent, so invalidate
2585 * destructor and make plain sk_free()
2586 */
2587 newsk->sk_destruct = NULL;
2588 bh_unlock_sock(newsk);
2589 sk_free(newsk);
2590 newsk = NULL;
2591 goto out;
2592 }
2593 EXPORT_SYMBOL_GPL(sk_clone_lock);
2594
2595 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2596 {
2597 bool is_ipv6 = false;
2598 u32 max_size;
2599
2600 #if IS_ENABLED(CONFIG_IPV6)
2601 is_ipv6 = (sk->sk_family == AF_INET6 &&
2602 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2603 #endif
2604 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2605 max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2606 READ_ONCE(dst->dev->gso_ipv4_max_size);
2607 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2608 max_size = GSO_LEGACY_MAX_SIZE;
2609
2610 return max_size - (MAX_TCP_HEADER + 1);
2611 }
2612
2613 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2614 {
2615 u32 max_segs = 1;
2616
2617 sk->sk_route_caps = dst->dev->features;
2618 if (sk_is_tcp(sk)) {
2619 struct inet_connection_sock *icsk = inet_csk(sk);
2620
2621 sk->sk_route_caps |= NETIF_F_GSO;
2622 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
2623 }
2624 if (sk->sk_route_caps & NETIF_F_GSO)
2625 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2626 if (unlikely(sk->sk_gso_disabled))
2627 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2628 if (sk_can_gso(sk)) {
2629 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2630 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2631 } else {
2632 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2633 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2634 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2635 max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2636 }
2637 }
2638 sk->sk_gso_max_segs = max_segs;
2639 sk_dst_set(sk, dst);
2640 }
2641 EXPORT_SYMBOL_GPL(sk_setup_caps);
2642
2643 /*
2644 * Simple resource managers for sockets.
2645 */
2646
2647
2648 /*
2649 * Write buffer destructor automatically called from kfree_skb.
2650 */
2651 void sock_wfree(struct sk_buff *skb)
2652 {
2653 struct sock *sk = skb->sk;
2654 unsigned int len = skb->truesize;
2655 bool free;
2656
2657 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2658 if (sock_flag(sk, SOCK_RCU_FREE) &&
2659 sk->sk_write_space == sock_def_write_space) {
2660 rcu_read_lock();
2661 free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2662 sock_def_write_space_wfree(sk);
2663 rcu_read_unlock();
2664 if (unlikely(free))
2665 __sk_free(sk);
2666 return;
2667 }
2668
2669 /*
2670 * Keep a reference on sk_wmem_alloc, this will be released
2671 * after sk_write_space() call
2672 */
2673 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2674 sk->sk_write_space(sk);
2675 len = 1;
2676 }
2677 /*
2678 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2679 * could not do because of in-flight packets
2680 */
2681 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2682 __sk_free(sk);
2683 }
2684 EXPORT_SYMBOL(sock_wfree);
2685
2686 /* This variant of sock_wfree() is used by TCP,
2687 * since it sets SOCK_USE_WRITE_QUEUE.
2688 */
2689 void __sock_wfree(struct sk_buff *skb)
2690 {
2691 struct sock *sk = skb->sk;
2692
2693 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2694 __sk_free(sk);
2695 }
2696
2697 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2698 {
2699 skb_orphan(skb);
2700 #ifdef CONFIG_INET
2701 if (unlikely(!sk_fullsock(sk)))
2702 return skb_set_owner_edemux(skb, sk);
2703 #endif
2704 skb->sk = sk;
2705 skb->destructor = sock_wfree;
2706 skb_set_hash_from_sk(skb, sk);
2707 /*
2708 * We used to take a refcount on sk, but following operation
2709 * is enough to guarantee sk_free() won't free this sock until
2710 * all in-flight packets are completed
2711 */
2712 refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2713 }
2714 EXPORT_SYMBOL(skb_set_owner_w);
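
/* Minimal sketch of write-buffer ownership: once an skb is charged to a
 * socket with skb_set_owner_w(), freeing it runs sock_wfree() and gives
 * the truesize back to sk_wmem_alloc (illustrative only):
 *
 *	struct sk_buff *skb = alloc_skb(len, GFP_KERNEL);
 *
 *	if (skb) {
 *		skb_set_owner_w(skb, sk);
 *		...
 *		kfree_skb(skb);		(runs sock_wfree(), uncharges sk)
 *	}
 */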
2715
2716 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2717 {
2718 /* Drivers depend on in-order delivery for crypto offload,
2719 * partial orphan breaks out-of-order-OK logic.
2720 */
2721 if (skb_is_decrypted(skb))
2722 return false;
2723
2724 return (skb->destructor == sock_wfree ||
2725 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2726 }
2727
2728 /* This helper is used by netem, as it can hold packets in its
2729 * delay queue. We want to allow the owner socket to send more
2730 * packets, as if they were already TX completed by a typical driver.
2731 * But we also want to keep skb->sk set because some packet schedulers
2732 * rely on it (sch_fq for example).
2733 */
2734 void skb_orphan_partial(struct sk_buff *skb)
2735 {
2736 if (skb_is_tcp_pure_ack(skb))
2737 return;
2738
2739 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2740 return;
2741
2742 skb_orphan(skb);
2743 }
2744 EXPORT_SYMBOL(skb_orphan_partial);
2745
2746 /*
2747 * Read buffer destructor automatically called from kfree_skb.
2748 */
2749 void sock_rfree(struct sk_buff *skb)
2750 {
2751 struct sock *sk = skb->sk;
2752 unsigned int len = skb->truesize;
2753
2754 atomic_sub(len, &sk->sk_rmem_alloc);
2755 sk_mem_uncharge(sk, len);
2756 }
2757 EXPORT_SYMBOL(sock_rfree);
2758
2759 /*
2760 * Buffer destructor for skbs that are not used directly in read or write
2761 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2762 */
2763 void sock_efree(struct sk_buff *skb)
2764 {
2765 sock_put(skb->sk);
2766 }
2767 EXPORT_SYMBOL(sock_efree);
2768
2769 /* Buffer destructor for prefetch/receive path where reference count may
2770 * not be held, e.g. for listen sockets.
2771 */
2772 #ifdef CONFIG_INET
2773 void sock_pfree(struct sk_buff *skb)
2774 {
2775 struct sock *sk = skb->sk;
2776
2777 if (!sk_is_refcounted(sk))
2778 return;
2779
2780 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
2781 inet_reqsk(sk)->rsk_listener = NULL;
2782 reqsk_free(inet_reqsk(sk));
2783 return;
2784 }
2785
2786 sock_gen_put(sk);
2787 }
2788 EXPORT_SYMBOL(sock_pfree);
2789 #endif /* CONFIG_INET */
2790
2791 kuid_t sock_i_uid(struct sock *sk)
2792 {
2793 kuid_t uid;
2794
2795 read_lock_bh(&sk->sk_callback_lock);
2796 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2797 read_unlock_bh(&sk->sk_callback_lock);
2798 return uid;
2799 }
2800 EXPORT_SYMBOL(sock_i_uid);
2801
2802 unsigned long __sock_i_ino(struct sock *sk)
2803 {
2804 unsigned long ino;
2805
2806 read_lock(&sk->sk_callback_lock);
2807 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2808 read_unlock(&sk->sk_callback_lock);
2809 return ino;
2810 }
2811 EXPORT_SYMBOL(__sock_i_ino);
2812
2813 unsigned long sock_i_ino(struct sock *sk)
2814 {
2815 unsigned long ino;
2816
2817 local_bh_disable();
2818 ino = __sock_i_ino(sk);
2819 local_bh_enable();
2820 return ino;
2821 }
2822 EXPORT_SYMBOL(sock_i_ino);
2823
2824 /*
2825 * Allocate a skb from the socket's send buffer.
2826 */
2827 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2828 gfp_t priority)
2829 {
2830 if (force ||
2831 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2832 struct sk_buff *skb = alloc_skb(size, priority);
2833
2834 if (skb) {
2835 skb_set_owner_w(skb, sk);
2836 return skb;
2837 }
2838 }
2839 return NULL;
2840 }
2841 EXPORT_SYMBOL(sock_wmalloc);
2842
2843 static void sock_ofree(struct sk_buff *skb)
2844 {
2845 struct sock *sk = skb->sk;
2846
2847 atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2848 }
2849
2850 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2851 gfp_t priority)
2852 {
2853 struct sk_buff *skb;
2854
2855 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2856 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2857 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max))
2858 return NULL;
2859
2860 skb = alloc_skb(size, priority);
2861 if (!skb)
2862 return NULL;
2863
2864 atomic_add(skb->truesize, &sk->sk_omem_alloc);
2865 skb->sk = sk;
2866 skb->destructor = sock_ofree;
2867 return skb;
2868 }
2869
2870 /*
2871 * Allocate a memory block from the socket's option memory buffer.
2872 */
2873 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2874 {
2875 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max);
2876
2877 if ((unsigned int)size <= optmem_max &&
2878 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2879 void *mem;
2880 /* First do the add, to avoid the race if kmalloc
2881 * might sleep.
2882 */
2883 atomic_add(size, &sk->sk_omem_alloc);
2884 mem = kmalloc(size, priority);
2885 if (mem)
2886 return mem;
2887 atomic_sub(size, &sk->sk_omem_alloc);
2888 }
2889 return NULL;
2890 }
2891 EXPORT_SYMBOL(sock_kmalloc);
2892
2893 /*
2894 * Duplicate the input "src" memory block using the socket's
2895 * option memory buffer.
2896 */
2897 void *sock_kmemdup(struct sock *sk, const void *src,
2898 int size, gfp_t priority)
2899 {
2900 void *mem;
2901
2902 mem = sock_kmalloc(sk, size, priority);
2903 if (mem)
2904 memcpy(mem, src, size);
2905 return mem;
2906 }
2907 EXPORT_SYMBOL(sock_kmemdup);
2908
2909 /* Free an option memory block. Note, we actually want the inline
2910 * here as this allows gcc to detect the nullify and fold away the
2911 * condition entirely.
2912 */
2913 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2914 const bool nullify)
2915 {
2916 if (WARN_ON_ONCE(!mem))
2917 return;
2918 if (nullify)
2919 kfree_sensitive(mem);
2920 else
2921 kfree(mem);
2922 atomic_sub(size, &sk->sk_omem_alloc);
2923 }
2924
2925 void sock_kfree_s(struct sock *sk, void *mem, int size)
2926 {
2927 __sock_kfree_s(sk, mem, size, false);
2928 }
2929 EXPORT_SYMBOL(sock_kfree_s);
2930
2931 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2932 {
2933 __sock_kfree_s(sk, mem, size, true);
2934 }
2935 EXPORT_SYMBOL(sock_kzfree_s);
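
/* Typical option-memory pattern built on the helpers above, charged
 * against sk_omem_alloc; a minimal sketch rather than code lifted from
 * any particular setsockopt handler:
 *
 *	void *buf = sock_kmalloc(sk, optlen, GFP_KERNEL);
 *
 *	if (!buf)
 *		return -ENOBUFS;
 *	if (copy_from_sockptr(buf, optval, optlen)) {
 *		sock_kfree_s(sk, buf, optlen);
 *		return -EFAULT;
 *	}
 *	...
 *	sock_kzfree_s(sk, buf, optlen);		(for key-like material)
 */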
2936
2937 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2938 I think these locks should be removed for datagram sockets.
2939 */
2940 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2941 {
2942 DEFINE_WAIT(wait);
2943
2944 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2945 for (;;) {
2946 if (!timeo)
2947 break;
2948 if (signal_pending(current))
2949 break;
2950 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2951 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2952 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2953 break;
2954 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2955 break;
2956 if (READ_ONCE(sk->sk_err))
2957 break;
2958 timeo = schedule_timeout(timeo);
2959 }
2960 finish_wait(sk_sleep(sk), &wait);
2961 return timeo;
2962 }
2963
2964
2965 /*
2966 * Generic send/receive buffer handlers
2967 */
2968
2969 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2970 unsigned long data_len, int noblock,
2971 int *errcode, int max_page_order)
2972 {
2973 struct sk_buff *skb;
2974 long timeo;
2975 int err;
2976
2977 timeo = sock_sndtimeo(sk, noblock);
2978 for (;;) {
2979 err = sock_error(sk);
2980 if (err != 0)
2981 goto failure;
2982
2983 err = -EPIPE;
2984 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
2985 goto failure;
2986
2987 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2988 break;
2989
2990 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2991 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2992 err = -EAGAIN;
2993 if (!timeo)
2994 goto failure;
2995 if (signal_pending(current))
2996 goto interrupted;
2997 timeo = sock_wait_for_wmem(sk, timeo);
2998 }
2999 skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
3000 errcode, sk->sk_allocation);
3001 if (skb)
3002 skb_set_owner_w(skb, sk);
3003 return skb;
3004
3005 interrupted:
3006 err = sock_intr_errno(timeo);
3007 failure:
3008 *errcode = err;
3009 return NULL;
3010 }
3011 EXPORT_SYMBOL(sock_alloc_send_pskb);
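
/* Minimal sketch of a datagram sendmsg path using this helper; error
 * handling is elided and "hlen" is assumed to cover the protocol
 * headers, so this is illustrative rather than any real protocol:
 *
 *	skb = sock_alloc_send_pskb(sk, hlen + len, 0,
 *				   msg->msg_flags & MSG_DONTWAIT,
 *				   &err, 0);
 *	if (!skb)
 *		return err;
 *	skb_reserve(skb, hlen);
 *	err = memcpy_from_msg(skb_put(skb, len), msg, len);
 *
 * The returned skb is already owned by sk via skb_set_owner_w(), so
 * sk_wmem_alloc has been charged with its truesize.
 */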
3012
3013 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
3014 struct sockcm_cookie *sockc)
3015 {
3016 u32 tsflags;
3017
3018 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));
3019
3020 switch (cmsg->cmsg_type) {
3021 case SO_MARK:
3022 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
3023 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3024 return -EPERM;
3025 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3026 return -EINVAL;
3027 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
3028 break;
3029 case SO_TIMESTAMPING_OLD:
3030 case SO_TIMESTAMPING_NEW:
3031 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3032 return -EINVAL;
3033
3034 tsflags = *(u32 *)CMSG_DATA(cmsg);
3035 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
3036 return -EINVAL;
3037
3038 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
3039 sockc->tsflags |= tsflags;
3040 break;
3041 case SCM_TXTIME:
3042 if (!sock_flag(sk, SOCK_TXTIME))
3043 return -EINVAL;
3044 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
3045 return -EINVAL;
3046 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
3047 break;
3048 case SCM_TS_OPT_ID:
3049 if (sk_is_tcp(sk))
3050 return -EINVAL;
3051 tsflags = READ_ONCE(sk->sk_tsflags);
3052 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
3053 return -EINVAL;
3054 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3055 return -EINVAL;
3056 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
3057 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
3058 break;
3059 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
3060 case SCM_RIGHTS:
3061 case SCM_CREDENTIALS:
3062 break;
3063 case SO_PRIORITY:
3064 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3065 return -EINVAL;
3066 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
3067 return -EPERM;
3068 sockc->priority = *(u32 *)CMSG_DATA(cmsg);
3069 break;
3070 case SCM_DEVMEM_DMABUF:
3071 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
3072 return -EINVAL;
3073 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
3074 break;
3075 default:
3076 return -EINVAL;
3077 }
3078 return 0;
3079 }
3080 EXPORT_SYMBOL(__sock_cmsg_send);
3081
3082 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
3083 struct sockcm_cookie *sockc)
3084 {
3085 struct cmsghdr *cmsg;
3086 int ret;
3087
3088 for_each_cmsghdr(cmsg, msg) {
3089 if (!CMSG_OK(msg, cmsg))
3090 return -EINVAL;
3091 if (cmsg->cmsg_level != SOL_SOCKET)
3092 continue;
3093 ret = __sock_cmsg_send(sk, cmsg, sockc);
3094 if (ret)
3095 return ret;
3096 }
3097 return 0;
3098 }
3099 EXPORT_SYMBOL(sock_cmsg_send);
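
/* Illustrative user-space counterpart of the SO_MARK case parsed in
 * __sock_cmsg_send(); a minimal sketch only, and it requires
 * CAP_NET_RAW or CAP_NET_ADMIN in the socket's user namespace:
 *
 *	char cbuf[CMSG_SPACE(sizeof(uint32_t))] = {};
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *	uint32_t mark = 42;
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SO_MARK;
 *	cm->cmsg_len = CMSG_LEN(sizeof(mark));
 *	memcpy(CMSG_DATA(cm), &mark, sizeof(mark));
 *	(set msg_iov, then call sendmsg(fd, &msg, 0))
 */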
3100
3101 static void sk_enter_memory_pressure(struct sock *sk)
3102 {
3103 if (!sk->sk_prot->enter_memory_pressure)
3104 return;
3105
3106 sk->sk_prot->enter_memory_pressure(sk);
3107 }
3108
3109 static void sk_leave_memory_pressure(struct sock *sk)
3110 {
3111 if (sk->sk_prot->leave_memory_pressure) {
3112 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
3113 tcp_leave_memory_pressure, sk);
3114 } else {
3115 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
3116
3117 if (memory_pressure && READ_ONCE(*memory_pressure))
3118 WRITE_ONCE(*memory_pressure, 0);
3119 }
3120 }
3121
3122 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
3123
3124 /**
3125 * skb_page_frag_refill - check that a page_frag contains enough room
3126 * @sz: minimum size of the fragment we want to get
3127 * @pfrag: pointer to page_frag
3128 * @gfp: priority for memory allocation
3129 *
3130 * Note: While this allocator tries to use high order pages, there is
3131 * no guarantee that allocations succeed. Therefore, @sz MUST be
3132 * less than or equal to PAGE_SIZE.
3133 */
3134 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
3135 {
3136 if (pfrag->page) {
3137 if (page_ref_count(pfrag->page) == 1) {
3138 pfrag->offset = 0;
3139 return true;
3140 }
3141 if (pfrag->offset + sz <= pfrag->size)
3142 return true;
3143 put_page(pfrag->page);
3144 }
3145
3146 pfrag->offset = 0;
3147 if (SKB_FRAG_PAGE_ORDER &&
3148 !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
3149 /* Avoid direct reclaim but allow kswapd to wake */
3150 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
3151 __GFP_COMP | __GFP_NOWARN |
3152 __GFP_NORETRY,
3153 SKB_FRAG_PAGE_ORDER);
3154 if (likely(pfrag->page)) {
3155 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
3156 return true;
3157 }
3158 }
3159 pfrag->page = alloc_page(gfp);
3160 if (likely(pfrag->page)) {
3161 pfrag->size = PAGE_SIZE;
3162 return true;
3163 }
3164 return false;
3165 }
3166 EXPORT_SYMBOL(skb_page_frag_refill);
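
/* Minimal sketch of the page_frag pattern these helpers support:
 * "copy" bytes (copy <= PAGE_SIZE) are appended at pfrag->offset and
 * the offset advanced, roughly as stream sendmsg paths do; the exact
 * copy routine varies per protocol, so this is only an approximation:
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	if (copy_from_iter(page_address(pfrag->page) + pfrag->offset,
 *			   copy, &msg->msg_iter) != copy)
 *		return -EFAULT;
 *	pfrag->offset += copy;
 */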
3167
3168 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
3169 {
3170 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
3171 return true;
3172
3173 sk_enter_memory_pressure(sk);
3174 sk_stream_moderate_sndbuf(sk);
3175 return false;
3176 }
3177 EXPORT_SYMBOL(sk_page_frag_refill);
3178
3179 void __lock_sock(struct sock *sk)
3180 __releases(&sk->sk_lock.slock)
3181 __acquires(&sk->sk_lock.slock)
3182 {
3183 DEFINE_WAIT(wait);
3184
3185 for (;;) {
3186 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
3187 TASK_UNINTERRUPTIBLE);
3188 spin_unlock_bh(&sk->sk_lock.slock);
3189 schedule();
3190 spin_lock_bh(&sk->sk_lock.slock);
3191 if (!sock_owned_by_user(sk))
3192 break;
3193 }
3194 finish_wait(&sk->sk_lock.wq, &wait);
3195 }
3196
3197 void __release_sock(struct sock *sk)
3198 __releases(&sk->sk_lock.slock)
3199 __acquires(&sk->sk_lock.slock)
3200 {
3201 struct sk_buff *skb, *next;
3202
3203 while ((skb = sk->sk_backlog.head) != NULL) {
3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
3205
3206 spin_unlock_bh(&sk->sk_lock.slock);
3207
3208 do {
3209 next = skb->next;
3210 prefetch(next);
3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
3212 skb_mark_not_on_list(skb);
3213 sk_backlog_rcv(sk, skb);
3214
3215 cond_resched();
3216
3217 skb = next;
3218 } while (skb != NULL);
3219
3220 spin_lock_bh(&sk->sk_lock.slock);
3221 }
3222
3223 /*
3224 * Doing the zeroing here guarantees we cannot loop forever
3225 * while a wild producer attempts to flood us.
3226 */
3227 sk->sk_backlog.len = 0;
3228 }
3229
3230 void __sk_flush_backlog(struct sock *sk)
3231 {
3232 spin_lock_bh(&sk->sk_lock.slock);
3233 __release_sock(sk);
3234
3235 if (sk->sk_prot->release_cb)
3236 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3237 tcp_release_cb, sk);
3238
3239 spin_unlock_bh(&sk->sk_lock.slock);
3240 }
3241 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3242
3243 /**
3244 * sk_wait_data - wait for data to arrive at sk_receive_queue
3245 * @sk: sock to wait on
3246 * @timeo: for how long
3247 * @skb: last skb seen on sk_receive_queue
3248 *
3249 * Now socket state including sk->sk_err is changed only under lock,
3250 * hence we may omit checks after joining wait queue.
3251 * We check receive queue before schedule() only as optimization;
3252 * it is very likely that release_sock() added new data.
3253 */
3254 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3255 {
3256 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3257 int rc;
3258
3259 add_wait_queue(sk_sleep(sk), &wait);
3260 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3261 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3262 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3263 remove_wait_queue(sk_sleep(sk), &wait);
3264 return rc;
3265 }
3266 EXPORT_SYMBOL(sk_wait_data);
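
/* Minimal sketch of the receive-side wait loop this helper serves,
 * mirroring what recvmsg implementations do under lock_sock(); the
 * flags handling and queue walk are elided:
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}
 */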
3267
3268 /**
3269 * __sk_mem_raise_allocated - increase memory_allocated
3270 * @sk: socket
3271 * @size: memory size to allocate
3272 * @amt: pages to allocate
3273 * @kind: allocation type
3274 *
3275 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3276 *
3277 * Unlike the globally shared limits among the sockets under same protocol,
3278 * consuming the budget of a memcg won't have direct effect on other ones.
3279 * So be optimistic about memcg's tolerance, and leave the callers to decide
3280 * whether or not to raise allocated through sk_under_memory_pressure() or
3281 * its variants.
3282 */
3283 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3284 {
3285 struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
3286 struct proto *prot = sk->sk_prot;
3287 bool charged = true;
3288 long allocated;
3289
3290 sk_memory_allocated_add(sk, amt);
3291 allocated = sk_memory_allocated(sk);
3292
3293 if (memcg) {
3294 charged = mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge());
3295 if (!charged)
3296 goto suppress_allocation;
3297 }
3298
3299 /* Under limit. */
3300 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3301 sk_leave_memory_pressure(sk);
3302 return 1;
3303 }
3304
3305 /* Under pressure. */
3306 if (allocated > sk_prot_mem_limits(sk, 1))
3307 sk_enter_memory_pressure(sk);
3308
3309 /* Over hard limit. */
3310 if (allocated > sk_prot_mem_limits(sk, 2))
3311 goto suppress_allocation;
3312
3313 /* Guarantee minimum buffer size under pressure (either global
3314 * or memcg) to make sure features described in RFC 7323 (TCP
3315 * Extensions for High Performance) work properly.
3316 *
3317 * This rule does NOT hold once usage exceeds the global or memcg hard
3318 * limit, or else a DoS attack could take place by spawning
3319 * lots of sockets whose usage is under the minimum buffer size.
3320 */
3321 if (kind == SK_MEM_RECV) {
3322 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3323 return 1;
3324
3325 } else { /* SK_MEM_SEND */
3326 int wmem0 = sk_get_wmem0(sk, prot);
3327
3328 if (sk->sk_type == SOCK_STREAM) {
3329 if (sk->sk_wmem_queued < wmem0)
3330 return 1;
3331 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3332 return 1;
3333 }
3334 }
3335
3336 if (sk_has_memory_pressure(sk)) {
3337 u64 alloc;
3338
3339 /* The following 'average' heuristic is within the
3340 * scope of global accounting, so it only makes
3341 * sense for global memory pressure.
3342 */
3343 if (!sk_under_global_memory_pressure(sk))
3344 return 1;
3345
3346 /* Try to be fair among all the sockets under global
3347 * pressure by allowing the ones whose usage is below
3348 * average to raise.
3349 */
3350 alloc = sk_sockets_allocated_read_positive(sk);
3351 if (sk_prot_mem_limits(sk, 2) > alloc *
3352 sk_mem_pages(sk->sk_wmem_queued +
3353 atomic_read(&sk->sk_rmem_alloc) +
3354 sk->sk_forward_alloc))
3355 return 1;
3356 }
3357
3358 suppress_allocation:
3359
3360 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
3361 sk_stream_moderate_sndbuf(sk);
3362
3363 /* Fail only if socket is _under_ its sndbuf.
3364 * In this case we cannot block, so that we have to fail.
3365 */
3366 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
3367 /* Force charge with __GFP_NOFAIL */
3368 if (memcg && !charged) {
3369 mem_cgroup_charge_skmem(memcg, amt,
3370 gfp_memcg_charge() | __GFP_NOFAIL);
3371 }
3372 return 1;
3373 }
3374 }
3375
3376 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
3377 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
3378
3379 sk_memory_allocated_sub(sk, amt);
3380
3381 if (memcg && charged)
3382 mem_cgroup_uncharge_skmem(memcg, amt);
3383
3384 return 0;
3385 }
3386
3387 /**
3388 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3389 * @sk: socket
3390 * @size: memory size to allocate
3391 * @kind: allocation type
3392 *
3393 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3394 * rmem allocation. This function assumes that protocols which have
3395 * memory_pressure use sk_wmem_queued as write buffer accounting.
3396 */
3397 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3398 {
3399 int ret, amt = sk_mem_pages(size);
3400
3401 sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3402 ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3403 if (!ret)
3404 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3405 return ret;
3406 }
3407 EXPORT_SYMBOL(__sk_mem_schedule);
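
/* Worked example of the page-based accounting above, assuming
 * PAGE_SIZE == 4096: charging a 6000 byte skb rounds up to
 * sk_mem_pages(6000) == 2 pages, so sk_forward_alloc grows by 8192
 * bytes and memory_allocated by 2 quanta; the unused 8192 - 6000
 * bytes stay in sk_forward_alloc for later charges. A receive-path
 * caller typically looks like this sketch:
 *
 *	if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *		goto drop;
 *	skb_set_owner_r(skb, sk);	(consumes forward_alloc)
 */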
3408
3409 /**
3410 * __sk_mem_reduce_allocated - reclaim memory_allocated
3411 * @sk: socket
3412 * @amount: number of quanta
3413 *
3414 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3415 */
3416 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3417 {
3418 sk_memory_allocated_sub(sk, amount);
3419
3420 if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3421 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3422
3423 if (sk_under_global_memory_pressure(sk) &&
3424 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3425 sk_leave_memory_pressure(sk);
3426 }
3427
3428 /**
3429 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3430 * @sk: socket
3431 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3432 */
3433 void __sk_mem_reclaim(struct sock *sk, int amount)
3434 {
3435 amount >>= PAGE_SHIFT;
3436 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3437 __sk_mem_reduce_allocated(sk, amount);
3438 }
3439 EXPORT_SYMBOL(__sk_mem_reclaim);
3440
3441 int sk_set_peek_off(struct sock *sk, int val)
3442 {
3443 WRITE_ONCE(sk->sk_peek_off, val);
3444 return 0;
3445 }
3446 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3447
3448 /*
3449 * Set of default routines for initialising struct proto_ops when
3450 * the protocol does not support a particular function. In certain
3451 * cases where it makes no sense for a protocol to have a "do nothing"
3452 * function, some default processing is provided.
3453 */
3454
3455 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
3456 {
3457 return -EOPNOTSUPP;
3458 }
3459 EXPORT_SYMBOL(sock_no_bind);
3460
3461 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
3462 int len, int flags)
3463 {
3464 return -EOPNOTSUPP;
3465 }
3466 EXPORT_SYMBOL(sock_no_connect);
3467
3468 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3469 {
3470 return -EOPNOTSUPP;
3471 }
3472 EXPORT_SYMBOL(sock_no_socketpair);
3473
3474 int sock_no_accept(struct socket *sock, struct socket *newsock,
3475 struct proto_accept_arg *arg)
3476 {
3477 return -EOPNOTSUPP;
3478 }
3479 EXPORT_SYMBOL(sock_no_accept);
3480
3481 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3482 int peer)
3483 {
3484 return -EOPNOTSUPP;
3485 }
3486 EXPORT_SYMBOL(sock_no_getname);
3487
3488 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3489 {
3490 return -EOPNOTSUPP;
3491 }
3492 EXPORT_SYMBOL(sock_no_ioctl);
3493
3494 int sock_no_listen(struct socket *sock, int backlog)
3495 {
3496 return -EOPNOTSUPP;
3497 }
3498 EXPORT_SYMBOL(sock_no_listen);
3499
3500 int sock_no_shutdown(struct socket *sock, int how)
3501 {
3502 return -EOPNOTSUPP;
3503 }
3504 EXPORT_SYMBOL(sock_no_shutdown);
3505
3506 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3507 {
3508 return -EOPNOTSUPP;
3509 }
3510 EXPORT_SYMBOL(sock_no_sendmsg);
3511
3512 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3513 {
3514 return -EOPNOTSUPP;
3515 }
3516 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3517
3518 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3519 int flags)
3520 {
3521 return -EOPNOTSUPP;
3522 }
3523 EXPORT_SYMBOL(sock_no_recvmsg);
3524
3525 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3526 {
3527 /* Mirror missing mmap method error code */
3528 return -ENODEV;
3529 }
3530 EXPORT_SYMBOL(sock_no_mmap);
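
/* Minimal sketch of how a protocol that has no notion of some of these
 * operations wires the stubs into its proto_ops; "my_proto_ops" and the
 * non-stub handlers are hypothetical:
 *
 *	static const struct proto_ops my_proto_ops = {
 *		.family		= PF_PACKET,
 *		.owner		= THIS_MODULE,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= my_recvmsg,
 *	};
 */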
3531
3532 /*
3533 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3534 * various sock-based usage counts.
3535 */
3536 void __receive_sock(struct file *file)
3537 {
3538 struct socket *sock;
3539
3540 sock = sock_from_file(file);
3541 if (sock) {
3542 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3543 sock_update_classid(&sock->sk->sk_cgrp_data);
3544 }
3545 }
3546
3547 /*
3548 * Default Socket Callbacks
3549 */
3550
3551 static void sock_def_wakeup(struct sock *sk)
3552 {
3553 struct socket_wq *wq;
3554
3555 rcu_read_lock();
3556 wq = rcu_dereference(sk->sk_wq);
3557 if (skwq_has_sleeper(wq))
3558 wake_up_interruptible_all(&wq->wait);
3559 rcu_read_unlock();
3560 }
3561
3562 static void sock_def_error_report(struct sock *sk)
3563 {
3564 struct socket_wq *wq;
3565
3566 rcu_read_lock();
3567 wq = rcu_dereference(sk->sk_wq);
3568 if (skwq_has_sleeper(wq))
3569 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3570 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR);
3571 rcu_read_unlock();
3572 }
3573
3574 void sock_def_readable(struct sock *sk)
3575 {
3576 struct socket_wq *wq;
3577
3578 trace_sk_data_ready(sk);
3579
3580 rcu_read_lock();
3581 wq = rcu_dereference(sk->sk_wq);
3582 if (skwq_has_sleeper(wq))
3583 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3584 EPOLLRDNORM | EPOLLRDBAND);
3585 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN);
3586 rcu_read_unlock();
3587 }
3588
3589 static void sock_def_write_space(struct sock *sk)
3590 {
3591 struct socket_wq *wq;
3592
3593 rcu_read_lock();
3594
3595 /* Do not wake up a writer until he can make "significant"
3596 * progress. --DaveM
3597 */
3598 if (sock_writeable(sk)) {
3599 wq = rcu_dereference(sk->sk_wq);
3600 if (skwq_has_sleeper(wq))
3601 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3602 EPOLLWRNORM | EPOLLWRBAND);
3603
3604 /* Should agree with poll, otherwise some programs break */
3605 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3606 }
3607
3608 rcu_read_unlock();
3609 }
3610
3611 /* An optimised version of sock_def_write_space(), should only be called
3612 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3613 * ->sk_wmem_alloc.
3614 */
3615 static void sock_def_write_space_wfree(struct sock *sk)
3616 {
3617 /* Do not wake up a writer until he can make "significant"
3618 * progress. --DaveM
3619 */
3620 if (sock_writeable(sk)) {
3621 struct socket_wq *wq = rcu_dereference(sk->sk_wq);
3622
3623 /* rely on refcount_sub from sock_wfree() */
3624 smp_mb__after_atomic();
3625 if (wq && waitqueue_active(&wq->wait))
3626 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3627 EPOLLWRNORM | EPOLLWRBAND);
3628
3629 /* Should agree with poll, otherwise some programs break */
3630 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT);
3631 }
3632 }
3633
3634 static void sock_def_destruct(struct sock *sk)
3635 {
3636 }
3637
3638 void sk_send_sigurg(struct sock *sk)
3639 {
3640 if (sk->sk_socket && sk->sk_socket->file)
3641 if (send_sigurg(sk->sk_socket->file))
3642 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3643 }
3644 EXPORT_SYMBOL(sk_send_sigurg);
3645
3646 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3647 unsigned long expires)
3648 {
3649 if (!mod_timer(timer, expires))
3650 sock_hold(sk);
3651 }
3652 EXPORT_SYMBOL(sk_reset_timer);
3653
3654 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3655 {
3656 if (timer_delete(timer))
3657 __sock_put(sk);
3658 }
3659 EXPORT_SYMBOL(sk_stop_timer);
3660
3661 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3662 {
3663 if (timer_delete_sync(timer))
3664 __sock_put(sk);
3665 }
3666 EXPORT_SYMBOL(sk_stop_timer_sync);
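
/* Minimal sketch of the refcount pairing these helpers provide: each
 * armed timer holds one sock reference; sk_stop_timer() drops it when
 * it cancels a pending timer, while an expired timer's handler is
 * expected to drop its own reference:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + delay);	(sock_hold)
 *	...
 *	sk_stop_timer(sk, &sk->sk_timer);	(__sock_put if still pending)
 */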
3667
3668 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
3669 {
3670 sk_init_common(sk);
3671 sk->sk_send_head = NULL;
3672
3673 timer_setup(&sk->sk_timer, NULL, 0);
3674
3675 sk->sk_allocation = GFP_KERNEL;
3676 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default);
3677 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default);
3678 sk->sk_state = TCP_CLOSE;
3679 sk->sk_use_task_frag = true;
3680 sk_set_socket(sk, sock);
3681
3682 sock_set_flag(sk, SOCK_ZAPPED);
3683
3684 if (sock) {
3685 sk->sk_type = sock->type;
3686 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3687 sock->sk = sk;
3688 } else {
3689 RCU_INIT_POINTER(sk->sk_wq, NULL);
3690 }
3691 sk->sk_uid = uid;
3692
3693 sk->sk_state_change = sock_def_wakeup;
3694 sk->sk_data_ready = sock_def_readable;
3695 sk->sk_write_space = sock_def_write_space;
3696 sk->sk_error_report = sock_def_error_report;
3697 sk->sk_destruct = sock_def_destruct;
3698
3699 sk->sk_frag.page = NULL;
3700 sk->sk_frag.offset = 0;
3701 sk->sk_peek_off = -1;
3702
3703 sk->sk_peer_pid = NULL;
3704 sk->sk_peer_cred = NULL;
3705 spin_lock_init(&sk->sk_peer_lock);
3706
3707 sk->sk_write_pending = 0;
3708 sk->sk_rcvlowat = 1;
3709 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
3710 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
3711
3712 sk->sk_stamp = SK_DEFAULT_STAMP;
3713 #if BITS_PER_LONG==32
3714 seqlock_init(&sk->sk_stamp_seq);
3715 #endif
3716 atomic_set(&sk->sk_zckey, 0);
3717
3718 #ifdef CONFIG_NET_RX_BUSY_POLL
3719 sk->sk_napi_id = 0;
3720 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read);
3721 #endif
3722
3723 sk->sk_max_pacing_rate = ~0UL;
3724 sk->sk_pacing_rate = ~0UL;
3725 WRITE_ONCE(sk->sk_pacing_shift, 10);
3726 sk->sk_incoming_cpu = -1;
3727
3728 sk_rx_queue_clear(sk);
3729 /*
3730 * Before updating sk_refcnt, we must commit prior changes to memory
3731 * (Documentation/RCU/rculist_nulls.rst for details)
3732 */
3733 smp_wmb();
3734 refcount_set(&sk->sk_refcnt, 1);
3735 atomic_set(&sk->sk_drops, 0);
3736 }
3737 EXPORT_SYMBOL(sock_init_data_uid);
3738
3739 void sock_init_data(struct socket *sock, struct sock *sk)
3740 {
3741 kuid_t uid = sock ?
3742 SOCK_INODE(sock)->i_uid :
3743 make_kuid(sock_net(sk)->user_ns, 0);
3744
3745 sock_init_data_uid(sock, sk, uid);
3746 }
3747 EXPORT_SYMBOL(sock_init_data);
3748
3749 void lock_sock_nested(struct sock *sk, int subclass)
3750 {
3751 /* The sk_lock has mutex_lock() semantics here. */
3752 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3753
3754 might_sleep();
3755 spin_lock_bh(&sk->sk_lock.slock);
3756 if (sock_owned_by_user_nocheck(sk))
3757 __lock_sock(sk);
3758 sk->sk_lock.owned = 1;
3759 spin_unlock_bh(&sk->sk_lock.slock);
3760 }
3761 EXPORT_SYMBOL(lock_sock_nested);
3762
3763 void release_sock(struct sock *sk)
3764 {
3765 spin_lock_bh(&sk->sk_lock.slock);
3766 if (sk->sk_backlog.tail)
3767 __release_sock(sk);
3768
3769 if (sk->sk_prot->release_cb)
3770 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
3771 tcp_release_cb, sk);
3772
3773 sock_release_ownership(sk);
3774 if (waitqueue_active(&sk->sk_lock.wq))
3775 wake_up(&sk->sk_lock.wq);
3776 spin_unlock_bh(&sk->sk_lock.slock);
3777 }
3778 EXPORT_SYMBOL(release_sock);
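
/* Minimal sketch of the process-context locking pattern built on the
 * helpers above; packets arriving in softirq context while the lock is
 * owned are queued to the backlog and run by __release_sock():
 *
 *	lock_sock(sk);
 *	...	(modify socket state, walk queues, etc.)
 *	release_sock(sk);
 */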
3779
3780 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3781 {
3782 might_sleep();
3783 spin_lock_bh(&sk->sk_lock.slock);
3784
3785 if (!sock_owned_by_user_nocheck(sk)) {
3786 /*
3787 * Fast path return with bottom halves disabled and
3788 * sock::sk_lock.slock held.
3789 *
3790 * The 'mutex' is not contended and holding
3791 * sock::sk_lock.slock prevents all other lockers to
3792 * proceed so the corresponding unlock_sock_fast() can
3793 * avoid the slow path of release_sock() completely and
3794 * just release slock.
3795 *
3796 * From a semantical POV this is equivalent to 'acquiring'
3797 * the 'mutex', hence the corresponding lockdep
3798 * mutex_release() has to happen in the fast path of
3799 * unlock_sock_fast().
3800 */
3801 return false;
3802 }
3803
3804 __lock_sock(sk);
3805 sk->sk_lock.owned = 1;
3806 __acquire(&sk->sk_lock.slock);
3807 spin_unlock_bh(&sk->sk_lock.slock);
3808 return true;
3809 }
3810 EXPORT_SYMBOL(__lock_sock_fast);
3811
3812 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3813 bool timeval, bool time32)
3814 {
3815 struct sock *sk = sock->sk;
3816 struct timespec64 ts;
3817
3818 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3819 ts = ktime_to_timespec64(sock_read_timestamp(sk));
3820 if (ts.tv_sec == -1)
3821 return -ENOENT;
3822 if (ts.tv_sec == 0) {
3823 ktime_t kt = ktime_get_real();
3824 sock_write_timestamp(sk, kt);
3825 ts = ktime_to_timespec64(kt);
3826 }
3827
3828 if (timeval)
3829 ts.tv_nsec /= 1000;
3830
3831 #ifdef CONFIG_COMPAT_32BIT_TIME
3832 if (time32)
3833 return put_old_timespec32(&ts, userstamp);
3834 #endif
3835 #ifdef CONFIG_SPARC64
3836 /* beware of padding in sparc64 timeval */
3837 if (timeval && !in_compat_syscall()) {
3838 struct __kernel_old_timeval __user tv = {
3839 .tv_sec = ts.tv_sec,
3840 .tv_usec = ts.tv_nsec,
3841 };
3842 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3843 return -EFAULT;
3844 return 0;
3845 }
3846 #endif
3847 return put_timespec64(&ts, userstamp);
3848 }
3849 EXPORT_SYMBOL(sock_gettstamp);
3850
3851 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3852 {
3853 if (!sock_flag(sk, flag)) {
3854 unsigned long previous_flags = sk->sk_flags;
3855
3856 sock_set_flag(sk, flag);
3857 /*
3858 * We just set one of the two flags that require net
3859 * time stamping, but time stamping might already have been
3860 * enabled because of the other one.
3861 */
3862 if (sock_needs_netstamp(sk) &&
3863 !(previous_flags & SK_FLAGS_TIMESTAMP))
3864 net_enable_timestamp();
3865 }
3866 }
3867
3868 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3869 int level, int type)
3870 {
3871 struct sock_exterr_skb *serr;
3872 struct sk_buff *skb;
3873 int copied, err;
3874
3875 err = -EAGAIN;
3876 skb = sock_dequeue_err_skb(sk);
3877 if (skb == NULL)
3878 goto out;
3879
3880 copied = skb->len;
3881 if (copied > len) {
3882 msg->msg_flags |= MSG_TRUNC;
3883 copied = len;
3884 }
3885 err = skb_copy_datagram_msg(skb, 0, msg, copied);
3886 if (err)
3887 goto out_free_skb;
3888
3889 sock_recv_timestamp(msg, sk, skb);
3890
3891 serr = SKB_EXT_ERR(skb);
3892 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3893
3894 msg->msg_flags |= MSG_ERRQUEUE;
3895 err = copied;
3896
3897 out_free_skb:
3898 kfree_skb(skb);
3899 out:
3900 return err;
3901 }
3902 EXPORT_SYMBOL(sock_recv_errqueue);
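
/*
 * Usage sketch (illustrative only): a protocol's recvmsg handler can hand
 * MSG_ERRQUEUE requests to this helper and keep only the regular data path
 * for itself.  my_recvmsg is a made-up name, and the cmsg level/type shown
 * is just an example.
 *
 *	static int my_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 *			      int flags, int *addr_len)
 *	{
 *		if (flags & MSG_ERRQUEUE)
 *			return sock_recv_errqueue(sk, msg, len, SOL_SOCKET,
 *						  SCM_TIMESTAMPING);
 *		...
 *	}
 */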
3903
3904 /*
3905 * Get a socket option on a socket.
3906 *
3907 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3908 * asynchronous errors should be reported by getsockopt. We assume
3909 * this means only if you specify SO_ERROR (otherwise what is the point of it).
3910 */
3911 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3912 char __user *optval, int __user *optlen)
3913 {
3914 struct sock *sk = sock->sk;
3915
3916 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3917 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3918 }
3919 EXPORT_SYMBOL(sock_common_getsockopt);
3920
3921 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3922 int flags)
3923 {
3924 struct sock *sk = sock->sk;
3925 int addr_len = 0;
3926 int err;
3927
3928 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3929 if (err >= 0)
3930 msg->msg_namelen = addr_len;
3931 return err;
3932 }
3933 EXPORT_SYMBOL(sock_common_recvmsg);
3934
3935 /*
3936 * Set socket options on an inet socket.
3937 */
3938 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3939 sockptr_t optval, unsigned int optlen)
3940 {
3941 struct sock *sk = sock->sk;
3942
3943 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3944 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3945 }
3946 EXPORT_SYMBOL(sock_common_setsockopt);
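
/*
 * Usage sketch (illustrative only): address families that need nothing more
 * than forwarding to sk->sk_prot can plug the sock_common_* helpers straight
 * into their struct proto_ops.  my_stream_ops and PF_MYFAMILY are made-up
 * names; only the relevant fields are shown.
 *
 *	static const struct proto_ops my_stream_ops = {
 *		.family		= PF_MYFAMILY,
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 */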
3947
3948 void sk_common_release(struct sock *sk)
3949 {
3950 if (sk->sk_prot->destroy)
3951 sk->sk_prot->destroy(sk);
3952
3953 /*
3954 * Observation: when sk_common_release() is called, processes no
3955 * longer have access to the socket, but the network stack still does.
3956 * Step one, detach it from networking:
3957 *
3958 * A. Remove it from the hash tables.
3959 */
3960
3961 sk->sk_prot->unhash(sk);
3962
3963 /*
3964 * At this point the socket cannot receive new packets, but it is possible
3965 * that some packets are still in flight, because another CPU ran the receiver
3966 * and did its hash table lookup before we unhashed the socket. Those packets
3967 * will reach the receive queue and be purged by the socket destructor.
3968 *
3969 * We also still have packets pending on the receive queue and, probably,
3970 * our own packets waiting in device queues. sock_destroy will drain the
3971 * receive queue, but transmitted packets delay socket destruction
3972 * until the last reference is released.
3973 */
3974
3975 sock_orphan(sk);
3976
3977 xfrm_sk_free_policy(sk);
3978
3979 sock_put(sk);
3980 }
3981 EXPORT_SYMBOL(sk_common_release);
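
/*
 * Usage sketch (illustrative only): for protocols with no extra teardown of
 * their own, the ->close handler can simply be a call to this helper (the
 * name my_close is made up).
 *
 *	static void my_close(struct sock *sk, long timeout)
 *	{
 *		sk_common_release(sk);
 *	}
 */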
3982
3983 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3984 {
3985 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3986
3987 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3988 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3989 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3990 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3991 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
3992 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3993 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3994 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3995 mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3996 }
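
/*
 * Usage sketch (illustrative only): callers provide an array of
 * SK_MEMINFO_VARS u32 slots, which is also the layout the sock_diag
 * interface exposes to userspace.
 *
 *	u32 mem[SK_MEMINFO_VARS];
 *
 *	sk_get_meminfo(sk, mem);
 *	pr_debug("rmem %u/%u wmem %u/%u\n",
 *		 mem[SK_MEMINFO_RMEM_ALLOC], mem[SK_MEMINFO_RCVBUF],
 *		 mem[SK_MEMINFO_WMEM_ALLOC], mem[SK_MEMINFO_SNDBUF]);
 */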
3997
3998 #ifdef CONFIG_PROC_FS
3999 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4000
4001 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4002 {
4003 int cpu, idx = prot->inuse_idx;
4004 int res = 0;
4005
4006 for_each_possible_cpu(cpu)
4007 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4008
4009 return res >= 0 ? res : 0;
4010 }
4011 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
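
/*
 * Sketch (illustrative only): the per-cpu counters summed here are updated
 * by protocols via sock_prot_inuse_add() from <net/sock.h>, typically +1
 * when a socket is hashed and -1 when it is unhashed.
 *
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	...
 *	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 */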
4012
4013 int sock_inuse_get(struct net *net)
4014 {
4015 int cpu, res = 0;
4016
4017 for_each_possible_cpu(cpu)
4018 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
4019
4020 return res;
4021 }
4023 EXPORT_SYMBOL_GPL(sock_inuse_get);
4024
4025 static int __net_init sock_inuse_init_net(struct net *net)
4026 {
4027 net->core.prot_inuse = alloc_percpu(struct prot_inuse);
4028 if (net->core.prot_inuse == NULL)
4029 return -ENOMEM;
4030 return 0;
4031 }
4032
4033 static void __net_exit sock_inuse_exit_net(struct net *net)
4034 {
4035 free_percpu(net->core.prot_inuse);
4036 }
4037
4038 static struct pernet_operations net_inuse_ops = {
4039 .init = sock_inuse_init_net,
4040 .exit = sock_inuse_exit_net,
4041 };
4042
4043 static __init int net_inuse_init(void)
4044 {
4045 if (register_pernet_subsys(&net_inuse_ops))
4046 panic("Cannot initialize net inuse counters");
4047
4048 return 0;
4049 }
4050
4051 core_initcall(net_inuse_init);
4052
4053 static int assign_proto_idx(struct proto *prot)
4054 {
4055 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
4056
4057 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) {
4058 pr_err("PROTO_INUSE_NR exhausted\n");
4059 return -ENOSPC;
4060 }
4061
4062 set_bit(prot->inuse_idx, proto_inuse_idx);
4063 return 0;
4064 }
4065
4066 static void release_proto_idx(struct proto *prot)
4067 {
4068 if (prot->inuse_idx != PROTO_INUSE_NR)
4069 clear_bit(prot->inuse_idx, proto_inuse_idx);
4070 }
4071 #else
4072 static inline int assign_proto_idx(struct proto *prot)
4073 {
4074 return 0;
4075 }
4076
4077 static inline void release_proto_idx(struct proto *prot)
4078 {
4079 }
4080
4081 #endif
4082
4083 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
4084 {
4085 if (!twsk_prot)
4086 return;
4087 kfree(twsk_prot->twsk_slab_name);
4088 twsk_prot->twsk_slab_name = NULL;
4089 kmem_cache_destroy(twsk_prot->twsk_slab);
4090 twsk_prot->twsk_slab = NULL;
4091 }
4092
4093 static int tw_prot_init(const struct proto *prot)
4094 {
4095 struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
4096
4097 if (!twsk_prot)
4098 return 0;
4099
4100 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
4101 prot->name);
4102 if (!twsk_prot->twsk_slab_name)
4103 return -ENOMEM;
4104
4105 twsk_prot->twsk_slab =
4106 kmem_cache_create(twsk_prot->twsk_slab_name,
4107 twsk_prot->twsk_obj_size, 0,
4108 SLAB_ACCOUNT | prot->slab_flags,
4109 NULL);
4110 if (!twsk_prot->twsk_slab) {
4111 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
4112 prot->name);
4113 return -ENOMEM;
4114 }
4115
4116 return 0;
4117 }
4118
4119 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
4120 {
4121 if (!rsk_prot)
4122 return;
4123 kfree(rsk_prot->slab_name);
4124 rsk_prot->slab_name = NULL;
4125 kmem_cache_destroy(rsk_prot->slab);
4126 rsk_prot->slab = NULL;
4127 }
4128
4129 static int req_prot_init(const struct proto *prot)
4130 {
4131 struct request_sock_ops *rsk_prot = prot->rsk_prot;
4132
4133 if (!rsk_prot)
4134 return 0;
4135
4136 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
4137 prot->name);
4138 if (!rsk_prot->slab_name)
4139 return -ENOMEM;
4140
4141 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
4142 rsk_prot->obj_size, 0,
4143 SLAB_ACCOUNT | prot->slab_flags,
4144 NULL);
4145
4146 if (!rsk_prot->slab) {
4147 pr_crit("%s: Can't create request sock SLAB cache!\n",
4148 prot->name);
4149 return -ENOMEM;
4150 }
4151 return 0;
4152 }
4153
4154 int proto_register(struct proto *prot, int alloc_slab)
4155 {
4156 int ret = -ENOBUFS;
4157
4158 if (prot->memory_allocated && !prot->sysctl_mem) {
4159 pr_err("%s: missing sysctl_mem\n", prot->name);
4160 return -EINVAL;
4161 }
4162 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
4163 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
4164 return -EINVAL;
4165 }
4166 if (alloc_slab) {
4167 prot->slab = kmem_cache_create_usercopy(prot->name,
4168 prot->obj_size, 0,
4169 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
4170 prot->slab_flags,
4171 prot->useroffset, prot->usersize,
4172 NULL);
4173
4174 if (prot->slab == NULL) {
4175 pr_crit("%s: Can't create sock SLAB cache!\n",
4176 prot->name);
4177 goto out;
4178 }
4179
4180 if (req_prot_init(prot))
4181 goto out_free_request_sock_slab;
4182
4183 if (tw_prot_init(prot))
4184 goto out_free_timewait_sock_slab;
4185 }
4186
4187 mutex_lock(&proto_list_mutex);
4188 ret = assign_proto_idx(prot);
4189 if (ret) {
4190 mutex_unlock(&proto_list_mutex);
4191 goto out_free_timewait_sock_slab;
4192 }
4193 list_add(&prot->node, &proto_list);
4194 mutex_unlock(&proto_list_mutex);
4195 return ret;
4196
4197 out_free_timewait_sock_slab:
4198 if (alloc_slab)
4199 tw_prot_cleanup(prot->twsk_prot);
4200 out_free_request_sock_slab:
4201 if (alloc_slab) {
4202 req_prot_cleanup(prot->rsk_prot);
4203
4204 kmem_cache_destroy(prot->slab);
4205 prot->slab = NULL;
4206 }
4207 out:
4208 return ret;
4209 }
4210 EXPORT_SYMBOL(proto_register);
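
/*
 * Usage sketch (illustrative only): a protocol module declares its struct
 * proto and registers it at init time; passing alloc_slab == 1 creates a
 * dedicated kmem cache of obj_size bytes per socket.  my_sock, my_proto and
 * my_init are made-up names.
 *
 *	struct my_sock {
 *		struct sock sk;
 *		...protocol private state...
 *	};
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	static int __init my_init(void)
 *	{
 *		return proto_register(&my_proto, 1);
 *	}
 */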
4211
4212 void proto_unregister(struct proto *prot)
4213 {
4214 mutex_lock(&proto_list_mutex);
4215 release_proto_idx(prot);
4216 list_del(&prot->node);
4217 mutex_unlock(&proto_list_mutex);
4218
4219 kmem_cache_destroy(prot->slab);
4220 prot->slab = NULL;
4221
4222 req_prot_cleanup(prot->rsk_prot);
4223 tw_prot_cleanup(prot->twsk_prot);
4224 }
4225 EXPORT_SYMBOL(proto_unregister);
4226
4227 int sock_load_diag_module(int family, int protocol)
4228 {
4229 if (!protocol) {
4230 if (!sock_is_registered(family))
4231 return -ENOENT;
4232
4233 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4234 NETLINK_SOCK_DIAG, family);
4235 }
4236
4237 #ifdef CONFIG_INET
4238 if (family == AF_INET &&
4239 protocol != IPPROTO_RAW &&
4240 protocol < MAX_INET_PROTOS &&
4241 !rcu_access_pointer(inet_protos[protocol]))
4242 return -ENOENT;
4243 #endif
4244
4245 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4246 NETLINK_SOCK_DIAG, family, protocol);
4247 }
4248 EXPORT_SYMBOL(sock_load_diag_module);
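
/*
 * Sketch (illustrative only): the request_module() strings above match the
 * aliases that sock_diag handler modules advertise, e.g. with
 * MODULE_ALIAS_NET_PF_PROTO_TYPE() from <linux/net.h>, so the right module
 * is loaded on demand.  For instance the AF_INET handler declares (2 being
 * AF_INET):
 *
 *	MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2);
 */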
4249
4250 #ifdef CONFIG_PROC_FS
4251 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
4252 __acquires(proto_list_mutex)
4253 {
4254 mutex_lock(&proto_list_mutex);
4255 return seq_list_start_head(&proto_list, *pos);
4256 }
4257
4258 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4259 {
4260 return seq_list_next(v, &proto_list, pos);
4261 }
4262
4263 static void proto_seq_stop(struct seq_file *seq, void *v)
4264 __releases(proto_list_mutex)
4265 {
4266 mutex_unlock(&proto_list_mutex);
4267 }
4268
4269 static char proto_method_implemented(const void *method)
4270 {
4271 return method == NULL ? 'n' : 'y';
4272 }
4273 static long sock_prot_memory_allocated(struct proto *proto)
4274 {
4275 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4276 }
4277
4278 static const char *sock_prot_memory_pressure(struct proto *proto)
4279 {
4280 return proto->memory_pressure != NULL ?
4281 proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4282 }
4283
4284 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
4285 {
4287 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4288 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4289 proto->name,
4290 proto->obj_size,
4291 sock_prot_inuse_get(seq_file_net(seq), proto),
4292 sock_prot_memory_allocated(proto),
4293 sock_prot_memory_pressure(proto),
4294 proto->max_header,
4295 proto->slab == NULL ? "no" : "yes",
4296 module_name(proto->owner),
4297 proto_method_implemented(proto->close),
4298 proto_method_implemented(proto->connect),
4299 proto_method_implemented(proto->disconnect),
4300 proto_method_implemented(proto->accept),
4301 proto_method_implemented(proto->ioctl),
4302 proto_method_implemented(proto->init),
4303 proto_method_implemented(proto->destroy),
4304 proto_method_implemented(proto->shutdown),
4305 proto_method_implemented(proto->setsockopt),
4306 proto_method_implemented(proto->getsockopt),
4307 proto_method_implemented(proto->sendmsg),
4308 proto_method_implemented(proto->recvmsg),
4309 proto_method_implemented(proto->bind),
4310 proto_method_implemented(proto->backlog_rcv),
4311 proto_method_implemented(proto->hash),
4312 proto_method_implemented(proto->unhash),
4313 proto_method_implemented(proto->get_port),
4314 proto_method_implemented(proto->enter_memory_pressure));
4315 }
4316
4317 static int proto_seq_show(struct seq_file *seq, void *v)
4318 {
4319 if (v == &proto_list)
4320 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4321 "protocol",
4322 "size",
4323 "sockets",
4324 "memory",
4325 "press",
4326 "maxhdr",
4327 "slab",
4328 "module",
4329 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4330 else
4331 proto_seq_printf(seq, list_entry(v, struct proto, node));
4332 return 0;
4333 }
4334
4335 static const struct seq_operations proto_seq_ops = {
4336 .start = proto_seq_start,
4337 .next = proto_seq_next,
4338 .stop = proto_seq_stop,
4339 .show = proto_seq_show,
4340 };
4341
4342 static __net_init int proto_init_net(struct net *net)
4343 {
4344 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4345 sizeof(struct seq_net_private)))
4346 return -ENOMEM;
4347
4348 return 0;
4349 }
4350
4351 static __net_exit void proto_exit_net(struct net *net)
4352 {
4353 remove_proc_entry("protocols", net->proc_net);
4354 }
4355
4357 static __net_initdata struct pernet_operations proto_net_ops = {
4358 .init = proto_init_net,
4359 .exit = proto_exit_net,
4360 };
4361
4362 static int __init proto_init(void)
4363 {
4364 return register_pernet_subsys(&proto_net_ops);
4365 }
4366
4367 subsys_initcall(proto_init);
4368
4369 #endif /* PROC_FS */
4370
4371 #ifdef CONFIG_NET_RX_BUSY_POLL
4372 bool sk_busy_loop_end(void *p, unsigned long start_time)
4373 {
4374 struct sock *sk = p;
4375
4376 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
4377 return true;
4378
4379 if (sk_is_udp(sk) &&
4380 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
4381 return true;
4382
4383 return sk_busy_loop_timeout(sk, start_time);
4384 }
4385 EXPORT_SYMBOL(sk_busy_loop_end);
4386 #endif /* CONFIG_NET_RX_BUSY_POLL */
4387
4388 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4389 {
4390 if (!sk->sk_prot->bind_add)
4391 return -EOPNOTSUPP;
4392 return sk->sk_prot->bind_add(sk, addr, addr_len);
4393 }
4394 EXPORT_SYMBOL(sock_bind_add);
4395
4396 /* Copy 'size' bytes in from userspace and, on success, copy 'size' bytes of result back to userspace */
4397 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4398 void __user *arg, void *karg, size_t size)
4399 {
4400 int ret;
4401
4402 if (copy_from_user(karg, arg, size))
4403 return -EFAULT;
4404
4405 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4406 if (ret)
4407 return ret;
4408
4409 if (copy_to_user(arg, karg, size))
4410 return -EFAULT;
4411
4412 return 0;
4413 }
4414 EXPORT_SYMBOL(sock_ioctl_inout);
4415
4416 /* This is the most common ioctl prep function, where the result (4 bytes) is
4417 * copied back to userspace if the ioctl() returns successfully. No input
4418 * argument is copied from userspace.
4419 */
4420 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4421 {
4422 int ret, karg = 0;
4423
4424 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4425 if (ret)
4426 return ret;
4427
4428 return put_user(karg, (int __user *)arg);
4429 }
4430
4431 /* A wrapper around sock ioctls, which copies the data from userspace
4432 * (depending on the protocol/ioctl), and copies back the result to userspace.
4433 * The main motivation for this function is to pass kernel memory to the
4434 * protocol ioctl callbacks, instead of userspace memory.
4435 */
4436 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4437 {
4438 int rc = 1;
4439
4440 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4441 rc = ipmr_sk_ioctl(sk, cmd, arg);
4442 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4443 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4444 else if (sk_is_phonet(sk))
4445 rc = phonet_sk_ioctl(sk, cmd, arg);
4446
4447 /* If the ioctl was processed, return its value */
4448 if (rc <= 0)
4449 return rc;
4450
4451 /* Otherwise call the default handler */
4452 return sock_ioctl_out(sk, cmd, arg);
4453 }
4454 EXPORT_SYMBOL(sk_ioctl);
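
/*
 * Usage sketch (illustrative only): with sk_ioctl() doing the user-space
 * copies, a protocol's ->ioctl callback only sees kernel memory.  my_ioctl
 * is a made-up name; see tcp_ioctl() for an in-tree example.
 *
 *	static int my_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = sk_rmem_alloc_get(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 */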
4455
4456 static int __init sock_struct_check(void)
4457 {
4458 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
4459 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
4460 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
4461 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
4462 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);
4463
4464 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
4465 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
4466 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
4467 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
4468 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
4469 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
4470 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
4471 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
4472 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);
4473
4474 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
4475 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
4476 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
4477
4478 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
4479 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
4480 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
4481 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);
4482
4483 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
4485 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_sndbuf);
4486 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
4487 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
4488 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
4489 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
4490 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
4491 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
4492 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
4493 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
4494 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
4495 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
4496 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
4497 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
4498 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);
4499
4500 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
4501 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
4502 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
4503 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
4504 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
4505 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
4506 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
4507 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
4508 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
4509 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
4510 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
4511 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
4512 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
4513 return 0;
4514 }
4515
4516 core_initcall(sock_struct_check);
4517