1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <linux/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/udp.h> 111 #include <linux/init.h> 112 #include <linux/highmem.h> 113 #include <linux/user_namespace.h> 114 #include <linux/static_key.h> 115 #include <linux/memcontrol.h> 116 #include <linux/prefetch.h> 117 #include <linux/compat.h> 118 #include <linux/mroute.h> 119 #include <linux/mroute6.h> 120 #include <linux/icmpv6.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <linux/skbuff_ref.h> 128 #include <net/net_namespace.h> 129 #include <net/request_sock.h> 130 #include <net/sock.h> 131 #include <net/proto_memory.h> 132 #include <linux/net_tstamp.h> 133 #include <net/xfrm.h> 134 #include <linux/ipsec.h> 135 #include <net/cls_cgroup.h> 136 #include <net/netprio_cgroup.h> 137 #include <linux/sock_diag.h> 138 139 #include <linux/filter.h> 140 #include <net/sock_reuseport.h> 141 #include <net/bpf_sk_storage.h> 142 143 #include <trace/events/sock.h> 144 145 #include <net/tcp.h> 146 #include <net/busy_poll.h> 147 #include <net/phonet/phonet.h> 148 149 #include <linux/ethtool.h> 150 151 #include <uapi/linux/pidfd.h> 152 153 #include "dev.h" 154 155 static DEFINE_MUTEX(proto_list_mutex); 156 static LIST_HEAD(proto_list); 157 158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); 159 static void sock_def_write_space(struct sock *sk); 160 161 /** 162 * sk_ns_capable - General socket capability test 163 * @sk: Socket to use a capability on or through 164 * @user_ns: The user namespace of the capability to use 165 * @cap: The capability to use 166 * 167 * Test to see if the opener of the socket had when the socket was 168 * created and the current process has the capability @cap in the user 169 * namespace @user_ns. 170 */ 171 bool sk_ns_capable(const struct sock *sk, 172 struct user_namespace *user_ns, int cap) 173 { 174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 175 ns_capable(user_ns, cap); 176 } 177 EXPORT_SYMBOL(sk_ns_capable); 178 179 /** 180 * sk_capable - Socket global capability test 181 * @sk: Socket to use a capability on or through 182 * @cap: The global capability to use 183 * 184 * Test to see if the opener of the socket had when the socket was 185 * created and the current process has the capability @cap in all user 186 * namespaces. 
 * namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when the
 * socket was created and the current process has the capability @cap over
 * the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x) \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MCTP"  , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
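 * These back the net.core.{w,r}mem_max and net.core.{w,r}mem_default
 * sysctls.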
*/ 284 __u32 sysctl_wmem_max __read_mostly = 4 << 20; 285 EXPORT_SYMBOL(sysctl_wmem_max); 286 __u32 sysctl_rmem_max __read_mostly = 4 << 20; 287 EXPORT_SYMBOL(sysctl_rmem_max); 288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; 289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; 290 291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 292 EXPORT_SYMBOL_GPL(memalloc_socks_key); 293 294 /** 295 * sk_set_memalloc - sets %SOCK_MEMALLOC 296 * @sk: socket to set it on 297 * 298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 299 * It's the responsibility of the admin to adjust min_free_kbytes 300 * to meet the requirements 301 */ 302 void sk_set_memalloc(struct sock *sk) 303 { 304 sock_set_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation |= __GFP_MEMALLOC; 306 static_branch_inc(&memalloc_socks_key); 307 } 308 EXPORT_SYMBOL_GPL(sk_set_memalloc); 309 310 void sk_clear_memalloc(struct sock *sk) 311 { 312 sock_reset_flag(sk, SOCK_MEMALLOC); 313 sk->sk_allocation &= ~__GFP_MEMALLOC; 314 static_branch_dec(&memalloc_socks_key); 315 316 /* 317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 318 * progress of swapping. SOCK_MEMALLOC may be cleared while 319 * it has rmem allocations due to the last swapfile being deactivated 320 * but there is a risk that the socket is unusable due to exceeding 321 * the rmem limits. Reclaim the reserves and obey rmem limits again. 322 */ 323 sk_mem_reclaim(sk); 324 } 325 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 326 327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 328 { 329 int ret; 330 unsigned int noreclaim_flag; 331 332 /* these should have been dropped before queueing */ 333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 334 335 noreclaim_flag = memalloc_noreclaim_save(); 336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 337 tcp_v6_do_rcv, 338 tcp_v4_do_rcv, 339 sk, skb); 340 memalloc_noreclaim_restore(noreclaim_flag); 341 342 return ret; 343 } 344 EXPORT_SYMBOL(__sk_backlog_rcv); 345 346 void sk_error_report(struct sock *sk) 347 { 348 sk->sk_error_report(sk); 349 350 switch (sk->sk_family) { 351 case AF_INET: 352 fallthrough; 353 case AF_INET6: 354 trace_inet_sk_error_report(sk); 355 break; 356 default: 357 break; 358 } 359 } 360 EXPORT_SYMBOL(sk_error_report); 361 362 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 363 { 364 struct __kernel_sock_timeval tv; 365 366 if (timeo == MAX_SCHEDULE_TIMEOUT) { 367 tv.tv_sec = 0; 368 tv.tv_usec = 0; 369 } else { 370 tv.tv_sec = timeo / HZ; 371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 372 } 373 374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 376 *(struct old_timeval32 *)optval = tv32; 377 return sizeof(tv32); 378 } 379 380 if (old_timeval) { 381 struct __kernel_old_timeval old_tv; 382 old_tv.tv_sec = tv.tv_sec; 383 old_tv.tv_usec = tv.tv_usec; 384 *(struct __kernel_old_timeval *)optval = old_tv; 385 return sizeof(old_tv); 386 } 387 388 *(struct __kernel_sock_timeval *)optval = tv; 389 return sizeof(tv); 390 } 391 EXPORT_SYMBOL(sock_get_timeout); 392 393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 394 sockptr_t optval, int optlen, bool old_timeval) 395 { 396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 397 struct old_timeval32 tv32; 398 399 if (optlen < sizeof(tv32)) 400 return -EINVAL; 401 402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 403 return -EFAULT; 404 tv->tv_sec = tv32.tv_sec; 405 tv->tv_usec = tv32.tv_usec; 
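		/* Note: struct old_timeval32 carries 32-bit seconds and
		 * microseconds from 32-bit userspace; the assignments above
		 * only widen them into the 64-bit __kernel_sock_timeval.
		 */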
406 } else if (old_timeval) { 407 struct __kernel_old_timeval old_tv; 408 409 if (optlen < sizeof(old_tv)) 410 return -EINVAL; 411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 412 return -EFAULT; 413 tv->tv_sec = old_tv.tv_sec; 414 tv->tv_usec = old_tv.tv_usec; 415 } else { 416 if (optlen < sizeof(*tv)) 417 return -EINVAL; 418 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 419 return -EFAULT; 420 } 421 422 return 0; 423 } 424 EXPORT_SYMBOL(sock_copy_user_timeval); 425 426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 427 bool old_timeval) 428 { 429 struct __kernel_sock_timeval tv; 430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 431 long val; 432 433 if (err) 434 return err; 435 436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 437 return -EDOM; 438 439 if (tv.tv_sec < 0) { 440 static int warned __read_mostly; 441 442 WRITE_ONCE(*timeo_p, 0); 443 if (warned < 10 && net_ratelimit()) { 444 warned++; 445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 446 __func__, current->comm, task_pid_nr(current)); 447 } 448 return 0; 449 } 450 val = MAX_SCHEDULE_TIMEOUT; 451 if ((tv.tv_sec || tv.tv_usec) && 452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 454 USEC_PER_SEC / HZ); 455 WRITE_ONCE(*timeo_p, val); 456 return 0; 457 } 458 459 static bool sk_set_prio_allowed(const struct sock *sk, int val) 460 { 461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || 462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); 464 } 465 466 static bool sock_needs_netstamp(const struct sock *sk) 467 { 468 switch (sk->sk_family) { 469 case AF_UNSPEC: 470 case AF_UNIX: 471 return false; 472 default: 473 return true; 474 } 475 } 476 477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 478 { 479 if (sk->sk_flags & flags) { 480 sk->sk_flags &= ~flags; 481 if (sock_needs_netstamp(sk) && 482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 483 net_disable_timestamp(); 484 } 485 } 486 487 488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 489 { 490 unsigned long flags; 491 struct sk_buff_head *list = &sk->sk_receive_queue; 492 493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 494 sk_drops_inc(sk); 495 trace_sock_rcvqueue_full(sk, skb); 496 return -ENOMEM; 497 } 498 499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 500 sk_drops_inc(sk); 501 return -ENOBUFS; 502 } 503 504 skb->dev = NULL; 505 skb_set_owner_r(skb, sk); 506 507 /* we escape from rcu protected region, make sure we dont leak 508 * a norefcounted dst 509 */ 510 skb_dst_force(skb); 511 512 spin_lock_irqsave(&list->lock, flags); 513 sock_skb_set_dropcount(sk, skb); 514 __skb_queue_tail(list, skb); 515 spin_unlock_irqrestore(&list->lock, flags); 516 517 if (!sock_flag(sk, SOCK_DEAD)) 518 sk->sk_data_ready(sk); 519 return 0; 520 } 521 EXPORT_SYMBOL(__sock_queue_rcv_skb); 522 523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 524 enum skb_drop_reason *reason) 525 { 526 enum skb_drop_reason drop_reason; 527 int err; 528 529 err = sk_filter_reason(sk, skb, &drop_reason); 530 if (err) 531 goto out; 532 533 err = __sock_queue_rcv_skb(sk, skb); 534 switch (err) { 535 case -ENOMEM: 536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 537 break; 538 case -ENOBUFS: 539 drop_reason = SKB_DROP_REASON_PROTO_MEM; 540 break; 541 default: 542 drop_reason = SKB_NOT_DROPPED_YET; 543 
break; 544 } 545 out: 546 if (reason) 547 *reason = drop_reason; 548 return err; 549 } 550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 551 552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 553 const int nested, unsigned int trim_cap, bool refcounted) 554 { 555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 556 int rc = NET_RX_SUCCESS; 557 int err; 558 559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) 560 goto discard_and_relse; 561 562 skb->dev = NULL; 563 564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 565 sk_drops_inc(sk); 566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 567 goto discard_and_relse; 568 } 569 if (nested) 570 bh_lock_sock_nested(sk); 571 else 572 bh_lock_sock(sk); 573 if (!sock_owned_by_user(sk)) { 574 /* 575 * trylock + unlock semantics: 576 */ 577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 578 579 rc = sk_backlog_rcv(sk, skb); 580 581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { 583 bh_unlock_sock(sk); 584 if (err == -ENOMEM) 585 reason = SKB_DROP_REASON_PFMEMALLOC; 586 if (err == -ENOBUFS) 587 reason = SKB_DROP_REASON_SOCKET_BACKLOG; 588 sk_drops_inc(sk); 589 goto discard_and_relse; 590 } 591 592 bh_unlock_sock(sk); 593 out: 594 if (refcounted) 595 sock_put(sk); 596 return rc; 597 discard_and_relse: 598 sk_skb_reason_drop(sk, skb, reason); 599 goto out; 600 } 601 EXPORT_SYMBOL(__sk_receive_skb); 602 603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 604 u32)); 605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 606 u32)); 607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 608 { 609 struct dst_entry *dst = __sk_dst_get(sk); 610 611 if (dst && READ_ONCE(dst->obsolete) && 612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 613 dst, cookie) == NULL) { 614 sk_tx_queue_clear(sk); 615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 617 dst_release(dst); 618 return NULL; 619 } 620 621 return dst; 622 } 623 EXPORT_SYMBOL(__sk_dst_check); 624 625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 626 { 627 struct dst_entry *dst = sk_dst_get(sk); 628 629 if (dst && READ_ONCE(dst->obsolete) && 630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 631 dst, cookie) == NULL) { 632 sk_dst_reset(sk); 633 dst_release(dst); 634 return NULL; 635 } 636 637 return dst; 638 } 639 EXPORT_SYMBOL(sk_dst_check); 640 641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 642 { 643 int ret = -ENOPROTOOPT; 644 #ifdef CONFIG_NETDEVICES 645 struct net *net = sock_net(sk); 646 647 /* Sorry... */ 648 ret = -EPERM; 649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 650 goto out; 651 652 ret = -EINVAL; 653 if (ifindex < 0) 654 goto out; 655 656 /* Paired with all READ_ONCE() done locklessly. 
*/ 657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 658 659 if (sk->sk_prot->rehash) 660 sk->sk_prot->rehash(sk); 661 sk_dst_reset(sk); 662 663 ret = 0; 664 665 out: 666 #endif 667 668 return ret; 669 } 670 671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 672 { 673 int ret; 674 675 if (lock_sk) 676 lock_sock(sk); 677 ret = sock_bindtoindex_locked(sk, ifindex); 678 if (lock_sk) 679 release_sock(sk); 680 681 return ret; 682 } 683 EXPORT_SYMBOL(sock_bindtoindex); 684 685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 686 { 687 int ret = -ENOPROTOOPT; 688 #ifdef CONFIG_NETDEVICES 689 struct net *net = sock_net(sk); 690 char devname[IFNAMSIZ]; 691 int index; 692 693 ret = -EINVAL; 694 if (optlen < 0) 695 goto out; 696 697 /* Bind this socket to a particular device like "eth0", 698 * as specified in the passed interface name. If the 699 * name is "" or the option length is zero the socket 700 * is not bound. 701 */ 702 if (optlen > IFNAMSIZ - 1) 703 optlen = IFNAMSIZ - 1; 704 memset(devname, 0, sizeof(devname)); 705 706 ret = -EFAULT; 707 if (copy_from_sockptr(devname, optval, optlen)) 708 goto out; 709 710 index = 0; 711 if (devname[0] != '\0') { 712 struct net_device *dev; 713 714 rcu_read_lock(); 715 dev = dev_get_by_name_rcu(net, devname); 716 if (dev) 717 index = dev->ifindex; 718 rcu_read_unlock(); 719 ret = -ENODEV; 720 if (!dev) 721 goto out; 722 } 723 724 sockopt_lock_sock(sk); 725 ret = sock_bindtoindex_locked(sk, index); 726 sockopt_release_sock(sk); 727 out: 728 #endif 729 730 return ret; 731 } 732 733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 734 sockptr_t optlen, int len) 735 { 736 int ret = -ENOPROTOOPT; 737 #ifdef CONFIG_NETDEVICES 738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 739 struct net *net = sock_net(sk); 740 char devname[IFNAMSIZ]; 741 742 if (bound_dev_if == 0) { 743 len = 0; 744 goto zero; 745 } 746 747 ret = -EINVAL; 748 if (len < IFNAMSIZ) 749 goto out; 750 751 ret = netdev_get_name(net, devname, bound_dev_if); 752 if (ret) 753 goto out; 754 755 len = strlen(devname) + 1; 756 757 ret = -EFAULT; 758 if (copy_to_sockptr(optval, devname, len)) 759 goto out; 760 761 zero: 762 ret = -EFAULT; 763 if (copy_to_sockptr(optlen, &len, sizeof(int))) 764 goto out; 765 766 ret = 0; 767 768 out: 769 #endif 770 771 return ret; 772 } 773 774 bool sk_mc_loop(const struct sock *sk) 775 { 776 if (dev_recursion_level()) 777 return false; 778 if (!sk) 779 return true; 780 /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ 781 switch (READ_ONCE(sk->sk_family)) { 782 case AF_INET: 783 return inet_test_bit(MC_LOOP, sk); 784 #if IS_ENABLED(CONFIG_IPV6) 785 case AF_INET6: 786 return inet6_test_bit(MC6_LOOP, sk); 787 #endif 788 } 789 WARN_ON_ONCE(1); 790 return true; 791 } 792 EXPORT_SYMBOL(sk_mc_loop); 793 794 void sock_set_reuseaddr(struct sock *sk) 795 { 796 lock_sock(sk); 797 sk->sk_reuse = SK_CAN_REUSE; 798 release_sock(sk); 799 } 800 EXPORT_SYMBOL(sock_set_reuseaddr); 801 802 void sock_set_reuseport(struct sock *sk) 803 { 804 lock_sock(sk); 805 sk->sk_reuseport = true; 806 release_sock(sk); 807 } 808 EXPORT_SYMBOL(sock_set_reuseport); 809 810 void sock_no_linger(struct sock *sk) 811 { 812 lock_sock(sk); 813 WRITE_ONCE(sk->sk_lingertime, 0); 814 sock_set_flag(sk, SOCK_LINGER); 815 release_sock(sk); 816 } 817 EXPORT_SYMBOL(sock_no_linger); 818 819 void sock_set_priority(struct sock *sk, u32 priority) 820 { 821 WRITE_ONCE(sk->sk_priority, priority); 822 } 823 EXPORT_SYMBOL(sock_set_priority); 824 825 void sock_set_sndtimeo(struct sock *sk, s64 secs) 826 { 827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 829 else 830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 831 } 832 EXPORT_SYMBOL(sock_set_sndtimeo); 833 834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 835 { 836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); 837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); 838 if (val) { 839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 840 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 841 } 842 } 843 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 845 { 846 switch (optname) { 847 case SO_TIMESTAMP_OLD: 848 __sock_set_timestamps(sk, valbool, false, false); 849 break; 850 case SO_TIMESTAMP_NEW: 851 __sock_set_timestamps(sk, valbool, true, false); 852 break; 853 case SO_TIMESTAMPNS_OLD: 854 __sock_set_timestamps(sk, valbool, false, true); 855 break; 856 case SO_TIMESTAMPNS_NEW: 857 __sock_set_timestamps(sk, valbool, true, true); 858 break; 859 } 860 } 861 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 863 { 864 struct net *net = sock_net(sk); 865 struct net_device *dev = NULL; 866 bool match = false; 867 int *vclock_index; 868 int i, num; 869 870 if (sk->sk_bound_dev_if) 871 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 872 873 if (!dev) { 874 pr_err("%s: sock not bind to device\n", __func__); 875 return -EOPNOTSUPP; 876 } 877 878 num = ethtool_get_phc_vclocks(dev, &vclock_index); 879 dev_put(dev); 880 881 for (i = 0; i < num; i++) { 882 if (*(vclock_index + i) == phc_index) { 883 match = true; 884 break; 885 } 886 } 887 888 if (num > 0) 889 kfree(vclock_index); 890 891 if (!match) 892 return -EINVAL; 893 894 WRITE_ONCE(sk->sk_bind_phc, phc_index); 895 896 return 0; 897 } 898 899 int sock_set_timestamping(struct sock *sk, int optname, 900 struct so_timestamping timestamping) 901 { 902 int val = timestamping.flags; 903 int ret; 904 905 if (val & ~SOF_TIMESTAMPING_MASK) 906 return -EINVAL; 907 908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 909 !(val & SOF_TIMESTAMPING_OPT_ID)) 910 return -EINVAL; 911 912 if (val & SOF_TIMESTAMPING_OPT_ID && 913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 914 if (sk_is_tcp(sk)) { 915 if ((1 << sk->sk_state) & 916 (TCPF_CLOSE | TCPF_LISTEN)) 917 return -EINVAL; 918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 920 else 921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 922 } else { 923 
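			/* Non-TCP sockets simply start the
			 * SOF_TIMESTAMPING_OPT_ID key sequence from zero.
			 */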
atomic_set(&sk->sk_tskey, 0); 924 } 925 } 926 927 if (val & SOF_TIMESTAMPING_OPT_STATS && 928 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 929 return -EINVAL; 930 931 if (val & SOF_TIMESTAMPING_BIND_PHC) { 932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 933 if (ret) 934 return ret; 935 } 936 937 WRITE_ONCE(sk->sk_tsflags, val); 938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); 940 941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 942 sock_enable_timestamp(sk, 943 SOCK_TIMESTAMPING_RX_SOFTWARE); 944 else 945 sock_disable_timestamp(sk, 946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 947 return 0; 948 } 949 950 #if defined(CONFIG_CGROUP_BPF) 951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) 952 { 953 struct bpf_sock_ops_kern sock_ops; 954 955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 956 sock_ops.op = op; 957 sock_ops.is_fullsock = 1; 958 sock_ops.sk = sk; 959 bpf_skops_init_skb(&sock_ops, skb, 0); 960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); 961 } 962 #endif 963 964 void sock_set_keepalive(struct sock *sk) 965 { 966 lock_sock(sk); 967 if (sk->sk_prot->keepalive) 968 sk->sk_prot->keepalive(sk, true); 969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 970 release_sock(sk); 971 } 972 EXPORT_SYMBOL(sock_set_keepalive); 973 974 static void __sock_set_rcvbuf(struct sock *sk, int val) 975 { 976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 977 * as a negative value. 978 */ 979 val = min_t(int, val, INT_MAX / 2); 980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 981 982 /* We double it on the way in to account for "struct sk_buff" etc. 983 * overhead. Applications assume that the SO_RCVBUF setting they make 984 * will allow that much actual data to be received on that socket. 985 * 986 * Applications are unaware that "struct sk_buff" and other overheads 987 * allocate from the receive buffer during socket buffer allocation. 988 * 989 * And after considering the possible alternatives, returning the value 990 * we actually used in getsockopt is the most desirable behavior. 
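	 *
	 * For example, a SO_RCVBUF request of 65536 bytes (assuming it is not
	 * clamped by sysctl_rmem_max first) ends up as 131072 in sk_rcvbuf,
	 * and a subsequent getsockopt(SO_RCVBUF) reports 131072.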
991 */ 992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 993 } 994 995 void sock_set_rcvbuf(struct sock *sk, int val) 996 { 997 lock_sock(sk); 998 __sock_set_rcvbuf(sk, val); 999 release_sock(sk); 1000 } 1001 EXPORT_SYMBOL(sock_set_rcvbuf); 1002 1003 static void __sock_set_mark(struct sock *sk, u32 val) 1004 { 1005 if (val != sk->sk_mark) { 1006 WRITE_ONCE(sk->sk_mark, val); 1007 sk_dst_reset(sk); 1008 } 1009 } 1010 1011 void sock_set_mark(struct sock *sk, u32 val) 1012 { 1013 lock_sock(sk); 1014 __sock_set_mark(sk, val); 1015 release_sock(sk); 1016 } 1017 EXPORT_SYMBOL(sock_set_mark); 1018 1019 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1020 { 1021 /* Round down bytes to multiple of pages */ 1022 bytes = round_down(bytes, PAGE_SIZE); 1023 1024 WARN_ON(bytes > sk->sk_reserved_mem); 1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1026 sk_mem_reclaim(sk); 1027 } 1028 1029 static int sock_reserve_memory(struct sock *sk, int bytes) 1030 { 1031 long allocated; 1032 bool charged; 1033 int pages; 1034 1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) 1036 return -EOPNOTSUPP; 1037 1038 if (!bytes) 1039 return 0; 1040 1041 pages = sk_mem_pages(bytes); 1042 1043 /* pre-charge to memcg */ 1044 charged = mem_cgroup_sk_charge(sk, pages, 1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1046 if (!charged) 1047 return -ENOMEM; 1048 1049 if (sk->sk_bypass_prot_mem) 1050 goto success; 1051 1052 /* pre-charge to forward_alloc */ 1053 sk_memory_allocated_add(sk, pages); 1054 allocated = sk_memory_allocated(sk); 1055 1056 /* If the system goes into memory pressure with this 1057 * precharge, give up and return error. 1058 */ 1059 if (allocated > sk_prot_mem_limits(sk, 1)) { 1060 sk_memory_allocated_sub(sk, pages); 1061 mem_cgroup_sk_uncharge(sk, pages); 1062 return -ENOMEM; 1063 } 1064 1065 success: 1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1067 1068 WRITE_ONCE(sk->sk_reserved_mem, 1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1070 1071 return 0; 1072 } 1073 1074 #ifdef CONFIG_PAGE_POOL 1075 1076 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED 1077 * in 1 syscall. The limit exists to limit the amount of memory the kernel 1078 * allocates to copy these tokens, and to prevent looping over the frags for 1079 * too long. 
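 *
 * A single call therefore copies at most MAX_DONTNEED_TOKENS tokens from
 * userspace and releases at most MAX_DONTNEED_FRAGS frags, no matter how many
 * frags the supplied tokens describe.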
1080 */ 1081 #define MAX_DONTNEED_TOKENS 128 1082 #define MAX_DONTNEED_FRAGS 1024 1083 1084 static noinline_for_stack int 1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1086 { 1087 unsigned int num_tokens, i, j, k, netmem_num = 0; 1088 struct dmabuf_token *tokens; 1089 int ret = 0, num_frags = 0; 1090 netmem_ref netmems[16]; 1091 1092 if (!sk_is_tcp(sk)) 1093 return -EBADF; 1094 1095 if (optlen % sizeof(*tokens) || 1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1097 return -EINVAL; 1098 1099 num_tokens = optlen / sizeof(*tokens); 1100 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); 1101 if (!tokens) 1102 return -ENOMEM; 1103 1104 if (copy_from_sockptr(tokens, optval, optlen)) { 1105 kvfree(tokens); 1106 return -EFAULT; 1107 } 1108 1109 xa_lock_bh(&sk->sk_user_frags); 1110 for (i = 0; i < num_tokens; i++) { 1111 for (j = 0; j < tokens[i].token_count; j++) { 1112 if (++num_frags > MAX_DONTNEED_FRAGS) 1113 goto frag_limit_reached; 1114 1115 netmem_ref netmem = (__force netmem_ref)__xa_erase( 1116 &sk->sk_user_frags, tokens[i].token_start + j); 1117 1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1119 continue; 1120 1121 netmems[netmem_num++] = netmem; 1122 if (netmem_num == ARRAY_SIZE(netmems)) { 1123 xa_unlock_bh(&sk->sk_user_frags); 1124 for (k = 0; k < netmem_num; k++) 1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1126 netmem_num = 0; 1127 xa_lock_bh(&sk->sk_user_frags); 1128 } 1129 ret++; 1130 } 1131 } 1132 1133 frag_limit_reached: 1134 xa_unlock_bh(&sk->sk_user_frags); 1135 for (k = 0; k < netmem_num; k++) 1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1137 1138 kvfree(tokens); 1139 return ret; 1140 } 1141 #endif 1142 1143 void sockopt_lock_sock(struct sock *sk) 1144 { 1145 /* When current->bpf_ctx is set, the setsockopt is called from 1146 * a bpf prog. bpf has ensured the sk lock has been 1147 * acquired before calling setsockopt(). 1148 */ 1149 if (has_current_bpf_ctx()) 1150 return; 1151 1152 lock_sock(sk); 1153 } 1154 EXPORT_SYMBOL(sockopt_lock_sock); 1155 1156 void sockopt_release_sock(struct sock *sk) 1157 { 1158 if (has_current_bpf_ctx()) 1159 return; 1160 1161 release_sock(sk); 1162 } 1163 EXPORT_SYMBOL(sockopt_release_sock); 1164 1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1166 { 1167 return has_current_bpf_ctx() || ns_capable(ns, cap); 1168 } 1169 EXPORT_SYMBOL(sockopt_ns_capable); 1170 1171 bool sockopt_capable(int cap) 1172 { 1173 return has_current_bpf_ctx() || capable(cap); 1174 } 1175 EXPORT_SYMBOL(sockopt_capable); 1176 1177 static int sockopt_validate_clockid(__kernel_clockid_t value) 1178 { 1179 switch (value) { 1180 case CLOCK_REALTIME: 1181 case CLOCK_MONOTONIC: 1182 case CLOCK_TAI: 1183 return 0; 1184 } 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * This is meant for all protocols to use and covers goings on 1190 * at the socket level. Everything here is generic. 
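 *
 * In-kernel callers normally do not invoke sk_setsockopt() directly; they go
 * through sock_setsockopt() with a KERNEL_SOCKPTR()-wrapped value, roughly:
 *
 *	int one = 1;
 *
 *	sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 *			KERNEL_SOCKPTR(&one), sizeof(one));
 *
 * or, preferably, use one of the sock_set_*() helpers defined above.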
1191 */ 1192 1193 int sk_setsockopt(struct sock *sk, int level, int optname, 1194 sockptr_t optval, unsigned int optlen) 1195 { 1196 struct so_timestamping timestamping; 1197 struct socket *sock = sk->sk_socket; 1198 struct sock_txtime sk_txtime; 1199 int val; 1200 int valbool; 1201 struct linger ling; 1202 int ret = 0; 1203 1204 /* 1205 * Options without arguments 1206 */ 1207 1208 if (optname == SO_BINDTODEVICE) 1209 return sock_setbindtodevice(sk, optval, optlen); 1210 1211 if (optlen < sizeof(int)) 1212 return -EINVAL; 1213 1214 if (copy_from_sockptr(&val, optval, sizeof(val))) 1215 return -EFAULT; 1216 1217 valbool = val ? 1 : 0; 1218 1219 /* handle options which do not require locking the socket. */ 1220 switch (optname) { 1221 case SO_PRIORITY: 1222 if (sk_set_prio_allowed(sk, val)) { 1223 sock_set_priority(sk, val); 1224 return 0; 1225 } 1226 return -EPERM; 1227 case SO_TYPE: 1228 case SO_PROTOCOL: 1229 case SO_DOMAIN: 1230 case SO_ERROR: 1231 return -ENOPROTOOPT; 1232 #ifdef CONFIG_NET_RX_BUSY_POLL 1233 case SO_BUSY_POLL: 1234 if (val < 0) 1235 return -EINVAL; 1236 WRITE_ONCE(sk->sk_ll_usec, val); 1237 return 0; 1238 case SO_PREFER_BUSY_POLL: 1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1240 return -EPERM; 1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1242 return 0; 1243 case SO_BUSY_POLL_BUDGET: 1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1245 !sockopt_capable(CAP_NET_ADMIN)) 1246 return -EPERM; 1247 if (val < 0 || val > U16_MAX) 1248 return -EINVAL; 1249 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1250 return 0; 1251 #endif 1252 case SO_MAX_PACING_RATE: 1253 { 1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1255 unsigned long pacing_rate; 1256 1257 if (sizeof(ulval) != sizeof(val) && 1258 optlen >= sizeof(ulval) && 1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1260 return -EFAULT; 1261 } 1262 if (ulval != ~0UL) 1263 cmpxchg(&sk->sk_pacing_status, 1264 SK_PACING_NONE, 1265 SK_PACING_NEEDED); 1266 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1269 if (ulval < pacing_rate) 1270 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1271 return 0; 1272 } 1273 case SO_TXREHASH: 1274 if (!sk_is_tcp(sk)) 1275 return -EOPNOTSUPP; 1276 if (val < -1 || val > 1) 1277 return -EINVAL; 1278 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1280 /* Paired with READ_ONCE() in tcp_rtx_synack() 1281 * and sk_getsockopt(). 
1282 */ 1283 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1284 return 0; 1285 case SO_PEEK_OFF: 1286 { 1287 int (*set_peek_off)(struct sock *sk, int val); 1288 1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1290 if (set_peek_off) 1291 ret = set_peek_off(sk, val); 1292 else 1293 ret = -EOPNOTSUPP; 1294 return ret; 1295 } 1296 #ifdef CONFIG_PAGE_POOL 1297 case SO_DEVMEM_DONTNEED: 1298 return sock_devmem_dontneed(sk, optval, optlen); 1299 #endif 1300 case SO_SNDTIMEO_OLD: 1301 case SO_SNDTIMEO_NEW: 1302 return sock_set_timeout(&sk->sk_sndtimeo, optval, 1303 optlen, optname == SO_SNDTIMEO_OLD); 1304 case SO_RCVTIMEO_OLD: 1305 case SO_RCVTIMEO_NEW: 1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval, 1307 optlen, optname == SO_RCVTIMEO_OLD); 1308 } 1309 1310 sockopt_lock_sock(sk); 1311 1312 switch (optname) { 1313 case SO_DEBUG: 1314 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1315 ret = -EACCES; 1316 else 1317 sock_valbool_flag(sk, SOCK_DBG, valbool); 1318 break; 1319 case SO_REUSEADDR: 1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1321 break; 1322 case SO_REUSEPORT: 1323 if (valbool && !sk_is_inet(sk)) 1324 ret = -EOPNOTSUPP; 1325 else 1326 sk->sk_reuseport = valbool; 1327 break; 1328 case SO_DONTROUTE: 1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1330 sk_dst_reset(sk); 1331 break; 1332 case SO_BROADCAST: 1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1334 break; 1335 case SO_SNDBUF: 1336 /* Don't error on this BSD doesn't and if you think 1337 * about it this is right. Otherwise apps have to 1338 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1339 * are treated in BSD as hints 1340 */ 1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1342 set_sndbuf: 1343 /* Ensure val * 2 fits into an int, to prevent max_t() 1344 * from treating it as a negative value. 1345 */ 1346 val = min_t(int, val, INT_MAX / 2); 1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1348 WRITE_ONCE(sk->sk_sndbuf, 1349 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1350 /* Wake up sending tasks if we upped the value. */ 1351 sk->sk_write_space(sk); 1352 break; 1353 1354 case SO_SNDBUFFORCE: 1355 if (!sockopt_capable(CAP_NET_ADMIN)) { 1356 ret = -EPERM; 1357 break; 1358 } 1359 1360 /* No negative values (to prevent underflow, as val will be 1361 * multiplied by 2). 1362 */ 1363 if (val < 0) 1364 val = 0; 1365 goto set_sndbuf; 1366 1367 case SO_RCVBUF: 1368 /* Don't error on this BSD doesn't and if you think 1369 * about it this is right. Otherwise apps have to 1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1371 * are treated in BSD as hints 1372 */ 1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1374 break; 1375 1376 case SO_RCVBUFFORCE: 1377 if (!sockopt_capable(CAP_NET_ADMIN)) { 1378 ret = -EPERM; 1379 break; 1380 } 1381 1382 /* No negative values (to prevent underflow, as val will be 1383 * multiplied by 2). 
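		 * A negative request is clamped to 0 just below, which the
		 * doubling logic then raises to SOCK_MIN_RCVBUF.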
		 */
		__sock_set_rcvbuf(sk, max(val, 0));
		break;

	case SO_KEEPALIVE:
		if (sk->sk_prot->keepalive)
			sk->sk_prot->keepalive(sk, valbool);
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check_tx = valbool;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff) {
			sock_reset_flag(sk, SOCK_LINGER);
		} else {
			unsigned long t_sec = ling.l_linger;

			if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
				WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
			else
				WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
	case SO_TIMESTAMP_NEW:
	case SO_TIMESTAMPNS_OLD:
	case SO_TIMESTAMPNS_NEW:
		sock_set_timestamp(sk, optname, valbool);
		break;

	case SO_TIMESTAMPING_NEW:
	case SO_TIMESTAMPING_OLD:
		if (optlen == sizeof(timestamping)) {
			if (copy_from_sockptr(&timestamping, optval,
					      sizeof(timestamping))) {
				ret = -EFAULT;
				break;
			}
		} else {
			memset(&timestamping, 0, sizeof(timestamping));
			timestamping.flags = val;
		}
		ret = sock_set_timestamping(sk, optname, timestamping);
		break;

	case SO_RCVLOWAT:
	{
		int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

		if (val < 0)
			val = INT_MAX;
		if (sock)
			set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
		if (set_rcvlowat)
			ret = set_rcvlowat(sk, val);
		else
			WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); 1461 break; 1462 } 1463 case SO_ATTACH_FILTER: { 1464 struct sock_fprog fprog; 1465 1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1467 if (!ret) 1468 ret = sk_attach_filter(&fprog, sk); 1469 break; 1470 } 1471 case SO_ATTACH_BPF: 1472 ret = -EINVAL; 1473 if (optlen == sizeof(u32)) { 1474 u32 ufd; 1475 1476 ret = -EFAULT; 1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1478 break; 1479 1480 ret = sk_attach_bpf(ufd, sk); 1481 } 1482 break; 1483 1484 case SO_ATTACH_REUSEPORT_CBPF: { 1485 struct sock_fprog fprog; 1486 1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1488 if (!ret) 1489 ret = sk_reuseport_attach_filter(&fprog, sk); 1490 break; 1491 } 1492 case SO_ATTACH_REUSEPORT_EBPF: 1493 ret = -EINVAL; 1494 if (optlen == sizeof(u32)) { 1495 u32 ufd; 1496 1497 ret = -EFAULT; 1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1499 break; 1500 1501 ret = sk_reuseport_attach_bpf(ufd, sk); 1502 } 1503 break; 1504 1505 case SO_DETACH_REUSEPORT_BPF: 1506 ret = reuseport_detach_prog(sk); 1507 break; 1508 1509 case SO_DETACH_FILTER: 1510 ret = sk_detach_filter(sk); 1511 break; 1512 1513 case SO_LOCK_FILTER: 1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1515 ret = -EPERM; 1516 else 1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1518 break; 1519 1520 case SO_MARK: 1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1523 ret = -EPERM; 1524 break; 1525 } 1526 1527 __sock_set_mark(sk, val); 1528 break; 1529 case SO_RCVMARK: 1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1531 break; 1532 1533 case SO_RCVPRIORITY: 1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1535 break; 1536 1537 case SO_RXQ_OVFL: 1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1539 break; 1540 1541 case SO_WIFI_STATUS: 1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1543 break; 1544 1545 case SO_NOFCS: 1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1547 break; 1548 1549 case SO_SELECT_ERR_QUEUE: 1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1551 break; 1552 1553 case SO_PASSCRED: 1554 if (sk_may_scm_recv(sk)) 1555 sk->sk_scm_credentials = valbool; 1556 else 1557 ret = -EOPNOTSUPP; 1558 break; 1559 1560 case SO_PASSSEC: 1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1562 sk->sk_scm_security = valbool; 1563 else 1564 ret = -EOPNOTSUPP; 1565 break; 1566 1567 case SO_PASSPIDFD: 1568 if (sk_is_unix(sk)) 1569 sk->sk_scm_pidfd = valbool; 1570 else 1571 ret = -EOPNOTSUPP; 1572 break; 1573 1574 case SO_PASSRIGHTS: 1575 if (sk_is_unix(sk)) 1576 sk->sk_scm_rights = valbool; 1577 else 1578 ret = -EOPNOTSUPP; 1579 break; 1580 1581 case SO_INCOMING_CPU: 1582 reuseport_update_incoming_cpu(sk, val); 1583 break; 1584 1585 case SO_CNX_ADVICE: 1586 if (val == 1) 1587 dst_negative_advice(sk); 1588 break; 1589 1590 case SO_ZEROCOPY: 1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1592 if (!(sk_is_tcp(sk) || 1593 (sk->sk_type == SOCK_DGRAM && 1594 sk->sk_protocol == IPPROTO_UDP))) 1595 ret = -EOPNOTSUPP; 1596 } else if (sk->sk_family != PF_RDS) { 1597 ret = -EOPNOTSUPP; 1598 } 1599 if (!ret) { 1600 if (val < 0 || val > 1) 1601 ret = -EINVAL; 1602 else 1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1604 } 1605 break; 1606 1607 case SO_TXTIME: 1608 if (optlen != sizeof(struct sock_txtime)) { 1609 ret = -EINVAL; 1610 break; 1611 } else if (copy_from_sockptr(&sk_txtime, optval, 1612 sizeof(struct sock_txtime))) { 1613 ret = -EFAULT; 1614 
break; 1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1616 ret = -EINVAL; 1617 break; 1618 } 1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1620 * scheduler has enough safe guards. 1621 */ 1622 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1624 ret = -EPERM; 1625 break; 1626 } 1627 1628 ret = sockopt_validate_clockid(sk_txtime.clockid); 1629 if (ret) 1630 break; 1631 1632 sock_valbool_flag(sk, SOCK_TXTIME, true); 1633 sk->sk_clockid = sk_txtime.clockid; 1634 sk->sk_txtime_deadline_mode = 1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1636 sk->sk_txtime_report_errors = 1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1638 break; 1639 1640 case SO_BINDTOIFINDEX: 1641 ret = sock_bindtoindex_locked(sk, val); 1642 break; 1643 1644 case SO_BUF_LOCK: 1645 if (val & ~SOCK_BUF_LOCK_MASK) { 1646 ret = -EINVAL; 1647 break; 1648 } 1649 sk->sk_userlocks = val | (sk->sk_userlocks & 1650 ~SOCK_BUF_LOCK_MASK); 1651 break; 1652 1653 case SO_RESERVE_MEM: 1654 { 1655 int delta; 1656 1657 if (val < 0) { 1658 ret = -EINVAL; 1659 break; 1660 } 1661 1662 delta = val - sk->sk_reserved_mem; 1663 if (delta < 0) 1664 sock_release_reserved_memory(sk, -delta); 1665 else 1666 ret = sock_reserve_memory(sk, delta); 1667 break; 1668 } 1669 1670 default: 1671 ret = -ENOPROTOOPT; 1672 break; 1673 } 1674 sockopt_release_sock(sk); 1675 return ret; 1676 } 1677 1678 int sock_setsockopt(struct socket *sock, int level, int optname, 1679 sockptr_t optval, unsigned int optlen) 1680 { 1681 return sk_setsockopt(sock->sk, level, optname, 1682 optval, optlen); 1683 } 1684 EXPORT_SYMBOL(sock_setsockopt); 1685 1686 static const struct cred *sk_get_peer_cred(struct sock *sk) 1687 { 1688 const struct cred *cred; 1689 1690 spin_lock(&sk->sk_peer_lock); 1691 cred = get_cred(sk->sk_peer_cred); 1692 spin_unlock(&sk->sk_peer_lock); 1693 1694 return cred; 1695 } 1696 1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1698 struct ucred *ucred) 1699 { 1700 ucred->pid = pid_vnr(pid); 1701 ucred->uid = ucred->gid = -1; 1702 if (cred) { 1703 struct user_namespace *current_ns = current_user_ns(); 1704 1705 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1706 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1707 } 1708 } 1709 1710 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1711 { 1712 struct user_namespace *user_ns = current_user_ns(); 1713 int i; 1714 1715 for (i = 0; i < src->ngroups; i++) { 1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1717 1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1719 return -EFAULT; 1720 } 1721 1722 return 0; 1723 } 1724 1725 int sk_getsockopt(struct sock *sk, int level, int optname, 1726 sockptr_t optval, sockptr_t optlen) 1727 { 1728 struct socket *sock = sk->sk_socket; 1729 1730 union { 1731 int val; 1732 u64 val64; 1733 unsigned long ulval; 1734 struct linger ling; 1735 struct old_timeval32 tm32; 1736 struct __kernel_old_timeval tm; 1737 struct __kernel_sock_timeval stm; 1738 struct sock_txtime txtime; 1739 struct so_timestamping timestamping; 1740 } v; 1741 1742 int lv = sizeof(int); 1743 int len; 1744 1745 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1746 return -EFAULT; 1747 if (len < 0) 1748 return -EINVAL; 1749 1750 memset(&v, 0, sizeof(v)); 1751 1752 switch (optname) { 1753 case SO_DEBUG: 1754 v.val = sock_flag(sk, SOCK_DBG); 1755 break; 1756 1757 case SO_DONTROUTE: 1758 v.val = sock_flag(sk, 
				      SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = READ_ONCE(sk->sk_sndbuf);
		break;

	case SO_RCVBUF:
		v.val = READ_ONCE(sk->sk_rcvbuf);
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_REUSEPORT:
		v.val = sk->sk_reuseport;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check_tx;
		break;

	case SO_PRIORITY:
		v.val = READ_ONCE(sk->sk_priority);
		break;

	case SO_LINGER:
		lv = sizeof(v.ling);
		v.ling.l_onoff = sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
		break;

	case SO_BSDCOMPAT:
		break;

	case SO_TIMESTAMP_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_TSTAMP_NEW) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS_OLD:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMP_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPNS_NEW:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
		break;

	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		lv = sizeof(v.timestamping);
		/* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
		 * returning the flags when they were set through the same option.
		 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1848 */ 1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1852 } 1853 break; 1854 1855 case SO_RCVTIMEO_OLD: 1856 case SO_RCVTIMEO_NEW: 1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1858 SO_RCVTIMEO_OLD == optname); 1859 break; 1860 1861 case SO_SNDTIMEO_OLD: 1862 case SO_SNDTIMEO_NEW: 1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1864 SO_SNDTIMEO_OLD == optname); 1865 break; 1866 1867 case SO_RCVLOWAT: 1868 v.val = READ_ONCE(sk->sk_rcvlowat); 1869 break; 1870 1871 case SO_SNDLOWAT: 1872 v.val = 1; 1873 break; 1874 1875 case SO_PASSCRED: 1876 if (!sk_may_scm_recv(sk)) 1877 return -EOPNOTSUPP; 1878 1879 v.val = sk->sk_scm_credentials; 1880 break; 1881 1882 case SO_PASSPIDFD: 1883 if (!sk_is_unix(sk)) 1884 return -EOPNOTSUPP; 1885 1886 v.val = sk->sk_scm_pidfd; 1887 break; 1888 1889 case SO_PASSRIGHTS: 1890 if (!sk_is_unix(sk)) 1891 return -EOPNOTSUPP; 1892 1893 v.val = sk->sk_scm_rights; 1894 break; 1895 1896 case SO_PEERCRED: 1897 { 1898 struct ucred peercred; 1899 if (len > sizeof(peercred)) 1900 len = sizeof(peercred); 1901 1902 spin_lock(&sk->sk_peer_lock); 1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1904 spin_unlock(&sk->sk_peer_lock); 1905 1906 if (copy_to_sockptr(optval, &peercred, len)) 1907 return -EFAULT; 1908 goto lenout; 1909 } 1910 1911 case SO_PEERPIDFD: 1912 { 1913 struct pid *peer_pid; 1914 struct file *pidfd_file = NULL; 1915 unsigned int flags = 0; 1916 int pidfd; 1917 1918 if (len > sizeof(pidfd)) 1919 len = sizeof(pidfd); 1920 1921 spin_lock(&sk->sk_peer_lock); 1922 peer_pid = get_pid(sk->sk_peer_pid); 1923 spin_unlock(&sk->sk_peer_lock); 1924 1925 if (!peer_pid) 1926 return -ENODATA; 1927 1928 /* The use of PIDFD_STALE requires stashing of struct pid 1929 * on pidfs with pidfs_register_pid() and only AF_UNIX 1930 * were prepared for this. 1931 */ 1932 if (sk->sk_family == AF_UNIX) 1933 flags = PIDFD_STALE; 1934 1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1936 put_pid(peer_pid); 1937 if (pidfd < 0) 1938 return pidfd; 1939 1940 if (copy_to_sockptr(optval, &pidfd, len) || 1941 copy_to_sockptr(optlen, &len, sizeof(int))) { 1942 put_unused_fd(pidfd); 1943 fput(pidfd_file); 1944 1945 return -EFAULT; 1946 } 1947 1948 fd_install(pidfd, pidfd_file); 1949 return 0; 1950 } 1951 1952 case SO_PEERGROUPS: 1953 { 1954 const struct cred *cred; 1955 int ret, n; 1956 1957 cred = sk_get_peer_cred(sk); 1958 if (!cred) 1959 return -ENODATA; 1960 1961 n = cred->group_info->ngroups; 1962 if (len < n * sizeof(gid_t)) { 1963 len = n * sizeof(gid_t); 1964 put_cred(cred); 1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1966 } 1967 len = n * sizeof(gid_t); 1968 1969 ret = groups_to_user(optval, cred->group_info); 1970 put_cred(cred); 1971 if (ret) 1972 return ret; 1973 goto lenout; 1974 } 1975 1976 case SO_PEERNAME: 1977 { 1978 struct sockaddr_storage address; 1979 1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1981 if (lv < 0) 1982 return -ENOTCONN; 1983 if (lv < len) 1984 return -EINVAL; 1985 if (copy_to_sockptr(optval, &address, len)) 1986 return -EFAULT; 1987 goto lenout; 1988 } 1989 1990 /* Dubious BSD thing... Probably nobody even uses it, but 1991 * the UNIX standard wants it for whatever reason... 
-DaveM 1992 */ 1993 case SO_ACCEPTCONN: 1994 v.val = sk->sk_state == TCP_LISTEN; 1995 break; 1996 1997 case SO_PASSSEC: 1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 1999 return -EOPNOTSUPP; 2000 2001 v.val = sk->sk_scm_security; 2002 break; 2003 2004 case SO_PEERSEC: 2005 return security_socket_getpeersec_stream(sock, 2006 optval, optlen, len); 2007 2008 case SO_MARK: 2009 v.val = READ_ONCE(sk->sk_mark); 2010 break; 2011 2012 case SO_RCVMARK: 2013 v.val = sock_flag(sk, SOCK_RCVMARK); 2014 break; 2015 2016 case SO_RCVPRIORITY: 2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2018 break; 2019 2020 case SO_RXQ_OVFL: 2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2022 break; 2023 2024 case SO_WIFI_STATUS: 2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2026 break; 2027 2028 case SO_PEEK_OFF: 2029 if (!READ_ONCE(sock->ops)->set_peek_off) 2030 return -EOPNOTSUPP; 2031 2032 v.val = READ_ONCE(sk->sk_peek_off); 2033 break; 2034 case SO_NOFCS: 2035 v.val = sock_flag(sk, SOCK_NOFCS); 2036 break; 2037 2038 case SO_BINDTODEVICE: 2039 return sock_getbindtodevice(sk, optval, optlen, len); 2040 2041 case SO_GET_FILTER: 2042 len = sk_get_filter(sk, optval, len); 2043 if (len < 0) 2044 return len; 2045 2046 goto lenout; 2047 2048 case SO_LOCK_FILTER: 2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2050 break; 2051 2052 case SO_BPF_EXTENSIONS: 2053 v.val = bpf_tell_extensions(); 2054 break; 2055 2056 case SO_SELECT_ERR_QUEUE: 2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2058 break; 2059 2060 #ifdef CONFIG_NET_RX_BUSY_POLL 2061 case SO_BUSY_POLL: 2062 v.val = READ_ONCE(sk->sk_ll_usec); 2063 break; 2064 case SO_PREFER_BUSY_POLL: 2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2066 break; 2067 #endif 2068 2069 case SO_MAX_PACING_RATE: 2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2072 lv = sizeof(v.ulval); 2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2074 } else { 2075 /* 32bit version */ 2076 v.val = min_t(unsigned long, ~0U, 2077 READ_ONCE(sk->sk_max_pacing_rate)); 2078 } 2079 break; 2080 2081 case SO_INCOMING_CPU: 2082 v.val = READ_ONCE(sk->sk_incoming_cpu); 2083 break; 2084 2085 case SO_MEMINFO: 2086 { 2087 u32 meminfo[SK_MEMINFO_VARS]; 2088 2089 sk_get_meminfo(sk, meminfo); 2090 2091 len = min_t(unsigned int, len, sizeof(meminfo)); 2092 if (copy_to_sockptr(optval, &meminfo, len)) 2093 return -EFAULT; 2094 2095 goto lenout; 2096 } 2097 2098 #ifdef CONFIG_NET_RX_BUSY_POLL 2099 case SO_INCOMING_NAPI_ID: 2100 v.val = READ_ONCE(sk->sk_napi_id); 2101 2102 /* aggregate non-NAPI IDs down to 0 */ 2103 if (!napi_id_valid(v.val)) 2104 v.val = 0; 2105 2106 break; 2107 #endif 2108 2109 case SO_COOKIE: 2110 lv = sizeof(u64); 2111 if (len < lv) 2112 return -EINVAL; 2113 v.val64 = sock_gen_cookie(sk); 2114 break; 2115 2116 case SO_ZEROCOPY: 2117 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2118 break; 2119 2120 case SO_TXTIME: 2121 lv = sizeof(v.txtime); 2122 v.txtime.clockid = sk->sk_clockid; 2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2124 SOF_TXTIME_DEADLINE_MODE : 0; 2125 v.txtime.flags |= sk->sk_txtime_report_errors ? 
2126 SOF_TXTIME_REPORT_ERRORS : 0; 2127 break; 2128 2129 case SO_BINDTOIFINDEX: 2130 v.val = READ_ONCE(sk->sk_bound_dev_if); 2131 break; 2132 2133 case SO_NETNS_COOKIE: 2134 lv = sizeof(u64); 2135 if (len != lv) 2136 return -EINVAL; 2137 v.val64 = sock_net(sk)->net_cookie; 2138 break; 2139 2140 case SO_BUF_LOCK: 2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2142 break; 2143 2144 case SO_RESERVE_MEM: 2145 v.val = READ_ONCE(sk->sk_reserved_mem); 2146 break; 2147 2148 case SO_TXREHASH: 2149 if (!sk_is_tcp(sk)) 2150 return -EOPNOTSUPP; 2151 2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2153 v.val = READ_ONCE(sk->sk_txrehash); 2154 break; 2155 2156 default: 2157 /* We implement the SO_SNDLOWAT etc to not be settable 2158 * (1003.1g 7). 2159 */ 2160 return -ENOPROTOOPT; 2161 } 2162 2163 if (len > lv) 2164 len = lv; 2165 if (copy_to_sockptr(optval, &v, len)) 2166 return -EFAULT; 2167 lenout: 2168 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2169 return -EFAULT; 2170 return 0; 2171 } 2172 2173 /* 2174 * Initialize an sk_lock. 2175 * 2176 * (We also register the sk_lock with the lock validator.) 2177 */ 2178 static inline void sock_lock_init(struct sock *sk) 2179 { 2180 sk_owner_clear(sk); 2181 2182 if (sk->sk_kern_sock) 2183 sock_lock_init_class_and_name( 2184 sk, 2185 af_family_kern_slock_key_strings[sk->sk_family], 2186 af_family_kern_slock_keys + sk->sk_family, 2187 af_family_kern_key_strings[sk->sk_family], 2188 af_family_kern_keys + sk->sk_family); 2189 else 2190 sock_lock_init_class_and_name( 2191 sk, 2192 af_family_slock_key_strings[sk->sk_family], 2193 af_family_slock_keys + sk->sk_family, 2194 af_family_key_strings[sk->sk_family], 2195 af_family_keys + sk->sk_family); 2196 } 2197 2198 /* 2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2200 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2202 */ 2203 static void sock_copy(struct sock *nsk, const struct sock *osk) 2204 { 2205 const struct proto *prot = READ_ONCE(osk->sk_prot); 2206 #ifdef CONFIG_SECURITY_NETWORK 2207 void *sptr = nsk->sk_security; 2208 #endif 2209 2210 /* If we move sk_tx_queue_mapping out of the private section, 2211 * we must check if sk_tx_queue_clear() is called after 2212 * sock_copy() in sk_clone_lock(). 
2213 */ 2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2215 offsetof(struct sock, sk_dontcopy_begin) || 2216 offsetof(struct sock, sk_tx_queue_mapping) >= 2217 offsetof(struct sock, sk_dontcopy_end)); 2218 2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2220 2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2223 /* alloc is larger than struct, see sk_prot_alloc() */); 2224 2225 #ifdef CONFIG_SECURITY_NETWORK 2226 nsk->sk_security = sptr; 2227 security_sk_clone(osk, nsk); 2228 #endif 2229 } 2230 2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2232 int family) 2233 { 2234 struct sock *sk; 2235 struct kmem_cache *slab; 2236 2237 slab = prot->slab; 2238 if (slab != NULL) { 2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2240 if (!sk) 2241 return sk; 2242 if (want_init_on_alloc(priority)) 2243 sk_prot_clear_nulls(sk, prot->obj_size); 2244 } else 2245 sk = kmalloc(prot->obj_size, priority); 2246 2247 if (sk != NULL) { 2248 if (security_sk_alloc(sk, family, priority)) 2249 goto out_free; 2250 2251 if (!try_module_get(prot->owner)) 2252 goto out_free_sec; 2253 } 2254 2255 return sk; 2256 2257 out_free_sec: 2258 security_sk_free(sk); 2259 out_free: 2260 if (slab != NULL) 2261 kmem_cache_free(slab, sk); 2262 else 2263 kfree(sk); 2264 return NULL; 2265 } 2266 2267 static void sk_prot_free(struct proto *prot, struct sock *sk) 2268 { 2269 struct kmem_cache *slab; 2270 struct module *owner; 2271 2272 owner = prot->owner; 2273 slab = prot->slab; 2274 2275 cgroup_sk_free(&sk->sk_cgrp_data); 2276 mem_cgroup_sk_free(sk); 2277 security_sk_free(sk); 2278 2279 sk_owner_put(sk); 2280 2281 if (slab != NULL) 2282 kmem_cache_free(slab, sk); 2283 else 2284 kfree(sk); 2285 module_put(owner); 2286 } 2287 2288 /** 2289 * sk_alloc - All socket objects are allocated here 2290 * @net: the applicable net namespace 2291 * @family: protocol family 2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2293 * @prot: struct proto associated with this new sock instance 2294 * @kern: is this to be a kernel socket? 2295 */ 2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2297 struct proto *prot, int kern) 2298 { 2299 struct sock *sk; 2300 2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2302 if (sk) { 2303 sk->sk_family = family; 2304 /* 2305 * See comment in struct sock definition to understand 2306 * why we need sk_prot_creator -acme 2307 */ 2308 sk->sk_prot = sk->sk_prot_creator = prot; 2309 2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) 2311 sk->sk_bypass_prot_mem = 1; 2312 2313 sk->sk_kern_sock = kern; 2314 sock_lock_init(sk); 2315 2316 sk->sk_net_refcnt = kern ? 0 : 1; 2317 if (likely(sk->sk_net_refcnt)) { 2318 get_net_track(net, &sk->ns_tracker, priority); 2319 sock_inuse_add(net, 1); 2320 } else { 2321 net_passive_inc(net); 2322 __netns_tracker_alloc(net, &sk->ns_tracker, 2323 false, priority); 2324 } 2325 2326 sock_net_set(sk, net); 2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2328 2329 mem_cgroup_sk_alloc(sk); 2330 cgroup_sk_alloc(&sk->sk_cgrp_data); 2331 sock_update_classid(&sk->sk_cgrp_data); 2332 sock_update_netprioidx(&sk->sk_cgrp_data); 2333 sk_tx_queue_clear(sk); 2334 } 2335 2336 return sk; 2337 } 2338 EXPORT_SYMBOL(sk_alloc); 2339 2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2341 * grace period. This is the case for UDP sockets and TCP listeners. 
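 * Other sockets reach it synchronously from sk_destruct(), unless a
 * reuseport callback is still attached, in which case sk_destruct()
 * forces an RCU callback as well (see below).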
2342 */ 2343 static void __sk_destruct(struct rcu_head *head) 2344 { 2345 struct sock *sk = container_of(head, struct sock, sk_rcu); 2346 struct net *net = sock_net(sk); 2347 struct sk_filter *filter; 2348 2349 if (sk->sk_destruct) 2350 sk->sk_destruct(sk); 2351 2352 filter = rcu_dereference_check(sk->sk_filter, 2353 refcount_read(&sk->sk_wmem_alloc) == 0); 2354 if (filter) { 2355 sk_filter_uncharge(sk, filter); 2356 RCU_INIT_POINTER(sk->sk_filter, NULL); 2357 } 2358 2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2360 2361 #ifdef CONFIG_BPF_SYSCALL 2362 bpf_sk_storage_free(sk); 2363 #endif 2364 2365 if (atomic_read(&sk->sk_omem_alloc)) 2366 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2367 __func__, atomic_read(&sk->sk_omem_alloc)); 2368 2369 if (sk->sk_frag.page) { 2370 put_page(sk->sk_frag.page); 2371 sk->sk_frag.page = NULL; 2372 } 2373 2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2375 put_cred(sk->sk_peer_cred); 2376 put_pid(sk->sk_peer_pid); 2377 2378 if (likely(sk->sk_net_refcnt)) { 2379 put_net_track(net, &sk->ns_tracker); 2380 } else { 2381 __netns_tracker_free(net, &sk->ns_tracker, false); 2382 net_passive_dec(net); 2383 } 2384 sk_prot_free(sk->sk_prot_creator, sk); 2385 } 2386 2387 void sk_net_refcnt_upgrade(struct sock *sk) 2388 { 2389 struct net *net = sock_net(sk); 2390 2391 WARN_ON_ONCE(sk->sk_net_refcnt); 2392 __netns_tracker_free(net, &sk->ns_tracker, false); 2393 net_passive_dec(net); 2394 sk->sk_net_refcnt = 1; 2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2396 sock_inuse_add(net, 1); 2397 } 2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2399 2400 void sk_destruct(struct sock *sk) 2401 { 2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2403 2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2405 reuseport_detach_sock(sk); 2406 use_call_rcu = true; 2407 } 2408 2409 if (use_call_rcu) 2410 call_rcu(&sk->sk_rcu, __sk_destruct); 2411 else 2412 __sk_destruct(&sk->sk_rcu); 2413 } 2414 2415 static void __sk_free(struct sock *sk) 2416 { 2417 if (likely(sk->sk_net_refcnt)) 2418 sock_inuse_add(sock_net(sk), -1); 2419 2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2421 sock_diag_broadcast_destroy(sk); 2422 else 2423 sk_destruct(sk); 2424 } 2425 2426 void sk_free(struct sock *sk) 2427 { 2428 /* 2429 * We subtract one from sk_wmem_alloc and can know if 2430 * some packets are still in some tx queue. 
2431 * If not null, sock_wfree() will call __sk_free(sk) later 2432 */ 2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2434 __sk_free(sk); 2435 } 2436 EXPORT_SYMBOL(sk_free); 2437 2438 static void sk_init_common(struct sock *sk) 2439 { 2440 skb_queue_head_init(&sk->sk_receive_queue); 2441 skb_queue_head_init(&sk->sk_write_queue); 2442 skb_queue_head_init(&sk->sk_error_queue); 2443 2444 rwlock_init(&sk->sk_callback_lock); 2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2446 af_rlock_keys + sk->sk_family, 2447 af_family_rlock_key_strings[sk->sk_family]); 2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2449 af_wlock_keys + sk->sk_family, 2450 af_family_wlock_key_strings[sk->sk_family]); 2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2452 af_elock_keys + sk->sk_family, 2453 af_family_elock_key_strings[sk->sk_family]); 2454 if (sk->sk_kern_sock) 2455 lockdep_set_class_and_name(&sk->sk_callback_lock, 2456 af_kern_callback_keys + sk->sk_family, 2457 af_family_kern_clock_key_strings[sk->sk_family]); 2458 else 2459 lockdep_set_class_and_name(&sk->sk_callback_lock, 2460 af_callback_keys + sk->sk_family, 2461 af_family_clock_key_strings[sk->sk_family]); 2462 } 2463 2464 /** 2465 * sk_clone - clone a socket 2466 * @sk: the socket to clone 2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2468 * @lock: if true, lock the cloned sk 2469 * 2470 * If @lock is true, the clone is locked by bh_lock_sock(), and 2471 * caller must unlock socket even in error path by bh_unlock_sock(). 2472 */ 2473 struct sock *sk_clone(const struct sock *sk, const gfp_t priority, 2474 bool lock) 2475 { 2476 struct proto *prot = READ_ONCE(sk->sk_prot); 2477 struct sk_filter *filter; 2478 bool is_charged = true; 2479 struct sock *newsk; 2480 2481 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2482 if (!newsk) 2483 goto out; 2484 2485 sock_copy(newsk, sk); 2486 2487 newsk->sk_prot_creator = prot; 2488 2489 /* SANITY */ 2490 if (likely(newsk->sk_net_refcnt)) { 2491 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2492 sock_inuse_add(sock_net(newsk), 1); 2493 } else { 2494 /* Kernel sockets are not elevating the struct net refcount. 2495 * Instead, use a tracker to more easily detect if a layer 2496 * is not properly dismantling its kernel sockets at netns 2497 * destroy time. 
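 * (Such a clone can later be converted to a refcounted one with
 * sk_net_refcnt_upgrade().)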
2498 */ 2499 net_passive_inc(sock_net(newsk)); 2500 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2501 false, priority); 2502 } 2503 2504 sk_node_init(&newsk->sk_node); 2505 sock_lock_init(newsk); 2506 2507 if (lock) 2508 bh_lock_sock(newsk); 2509 2510 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2511 newsk->sk_backlog.len = 0; 2512 2513 atomic_set(&newsk->sk_rmem_alloc, 0); 2514 2515 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2516 2517 atomic_set(&newsk->sk_omem_alloc, 0); 2518 sk_init_common(newsk); 2519 2520 newsk->sk_dst_cache = NULL; 2521 newsk->sk_dst_pending_confirm = 0; 2522 newsk->sk_wmem_queued = 0; 2523 newsk->sk_forward_alloc = 0; 2524 newsk->sk_reserved_mem = 0; 2525 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); 2526 sk_drops_reset(newsk); 2527 newsk->sk_send_head = NULL; 2528 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2529 atomic_set(&newsk->sk_zckey, 0); 2530 2531 sock_reset_flag(newsk, SOCK_DONE); 2532 2533 #ifdef CONFIG_MEMCG 2534 /* sk->sk_memcg will be populated at accept() time */ 2535 newsk->sk_memcg = NULL; 2536 #endif 2537 2538 cgroup_sk_clone(&newsk->sk_cgrp_data); 2539 2540 rcu_read_lock(); 2541 filter = rcu_dereference(sk->sk_filter); 2542 if (filter != NULL) 2543 /* though it's an empty new sock, the charging may fail 2544 * if sysctl_optmem_max was changed between creation of 2545 * original socket and cloning 2546 */ 2547 is_charged = sk_filter_charge(newsk, filter); 2548 RCU_INIT_POINTER(newsk->sk_filter, filter); 2549 rcu_read_unlock(); 2550 2551 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2552 /* We need to make sure that we don't uncharge the new 2553 * socket if we couldn't charge it in the first place 2554 * as otherwise we uncharge the parent's filter. 2555 */ 2556 if (!is_charged) 2557 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2558 2559 goto free; 2560 } 2561 2562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2563 2564 if (bpf_sk_storage_clone(sk, newsk)) 2565 goto free; 2566 2567 /* Clear sk_user_data if parent had the pointer tagged 2568 * as not suitable for copying when cloning. 
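 * (The tag is the SK_USER_DATA_NOCOPY bit carried in the pointer value;
 * sk_user_data_is_nocopy() tests for it.)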
2569 */ 2570 if (sk_user_data_is_nocopy(newsk)) 2571 newsk->sk_user_data = NULL; 2572 2573 newsk->sk_err = 0; 2574 newsk->sk_err_soft = 0; 2575 newsk->sk_priority = 0; 2576 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2577 2578 /* Before updating sk_refcnt, we must commit prior changes to memory 2579 * (Documentation/RCU/rculist_nulls.rst for details) 2580 */ 2581 smp_wmb(); 2582 refcount_set(&newsk->sk_refcnt, 2); 2583 2584 sk_set_socket(newsk, NULL); 2585 sk_tx_queue_clear(newsk); 2586 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2587 2588 if (newsk->sk_prot->sockets_allocated) 2589 sk_sockets_allocated_inc(newsk); 2590 2591 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2592 net_enable_timestamp(); 2593 out: 2594 return newsk; 2595 free: 2596 /* It is still raw copy of parent, so invalidate 2597 * destructor and make plain sk_free() 2598 */ 2599 newsk->sk_destruct = NULL; 2600 if (lock) 2601 bh_unlock_sock(newsk); 2602 sk_free(newsk); 2603 newsk = NULL; 2604 goto out; 2605 } 2606 EXPORT_SYMBOL_GPL(sk_clone); 2607 2608 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) 2609 { 2610 bool is_ipv6 = false; 2611 u32 max_size; 2612 2613 #if IS_ENABLED(CONFIG_IPV6) 2614 is_ipv6 = (sk->sk_family == AF_INET6 && 2615 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2616 #endif 2617 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2618 max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : 2619 READ_ONCE(dev->gso_ipv4_max_size); 2620 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2621 max_size = GSO_LEGACY_MAX_SIZE; 2622 2623 return max_size - (MAX_TCP_HEADER + 1); 2624 } 2625 2626 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2627 { 2628 const struct net_device *dev; 2629 u32 max_segs = 1; 2630 2631 rcu_read_lock(); 2632 dev = dst_dev_rcu(dst); 2633 sk->sk_route_caps = dev->features; 2634 if (sk_is_tcp(sk)) { 2635 struct inet_connection_sock *icsk = inet_csk(sk); 2636 2637 sk->sk_route_caps |= NETIF_F_GSO; 2638 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); 2639 } 2640 if (sk->sk_route_caps & NETIF_F_GSO) 2641 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2642 if (unlikely(sk->sk_gso_disabled)) 2643 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2644 if (sk_can_gso(sk)) { 2645 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2646 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2647 } else { 2648 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2649 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); 2650 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2651 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); 2652 } 2653 } 2654 sk->sk_gso_max_segs = max_segs; 2655 sk_dst_set(sk, dst); 2656 rcu_read_unlock(); 2657 } 2658 EXPORT_SYMBOL_GPL(sk_setup_caps); 2659 2660 /* 2661 * Simple resource managers for sockets. 2662 */ 2663 2664 2665 /* 2666 * Write buffer destructor automatically called from kfree_skb. 
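 *
 * Installed by skb_set_owner_w(); the truesize charged to sk_wmem_alloc
 * there is released here, and the socket is freed if this drops the last
 * write reference. A minimal lifecycle sketch (illustrative only, "len"
 * is a placeholder):
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	skb_set_owner_w(skb, sk);	(charges skb->truesize)
 *	...
 *	kfree_skb(skb);			(ends up calling sock_wfree)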
2667 */ 2668 void sock_wfree(struct sk_buff *skb) 2669 { 2670 unsigned int len = skb->truesize; 2671 struct sock *sk = skb->sk; 2672 bool free; 2673 int old; 2674 2675 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2676 if (sock_flag(sk, SOCK_RCU_FREE) && 2677 sk->sk_write_space == sock_def_write_space) { 2678 rcu_read_lock(); 2679 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, 2680 &old); 2681 sock_def_write_space_wfree(sk, old - len); 2682 rcu_read_unlock(); 2683 if (unlikely(free)) 2684 __sk_free(sk); 2685 return; 2686 } 2687 2688 /* 2689 * Keep a reference on sk_wmem_alloc, this will be released 2690 * after sk_write_space() call 2691 */ 2692 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2693 sk->sk_write_space(sk); 2694 len = 1; 2695 } 2696 /* 2697 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2698 * could not do because of in-flight packets 2699 */ 2700 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2701 __sk_free(sk); 2702 } 2703 EXPORT_SYMBOL(sock_wfree); 2704 2705 /* This variant of sock_wfree() is used by TCP, 2706 * since it sets SOCK_USE_WRITE_QUEUE. 2707 */ 2708 void __sock_wfree(struct sk_buff *skb) 2709 { 2710 struct sock *sk = skb->sk; 2711 2712 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2713 __sk_free(sk); 2714 } 2715 2716 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2717 { 2718 int old_wmem; 2719 2720 skb_orphan(skb); 2721 #ifdef CONFIG_INET 2722 if (unlikely(!sk_fullsock(sk))) 2723 return skb_set_owner_edemux(skb, sk); 2724 #endif 2725 skb->sk = sk; 2726 skb->destructor = sock_wfree; 2727 skb_set_hash_from_sk(skb, sk); 2728 /* 2729 * We used to take a refcount on sk, but following operation 2730 * is enough to guarantee sk_free() won't free this sock until 2731 * all in-flight packets are completed 2732 */ 2733 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); 2734 2735 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket 2736 * is in a host queue (qdisc, NIC queue). 2737 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue 2738 * based on XPS for better performance. 2739 * Otherwise clear ooo_okay to not risk Out Of Order delivery. 2740 */ 2741 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); 2742 } 2743 EXPORT_SYMBOL(skb_set_owner_w); 2744 2745 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2746 { 2747 /* Drivers depend on in-order delivery for crypto offload, 2748 * partial orphan breaks out-of-order-OK logic. 2749 */ 2750 if (skb_is_decrypted(skb)) 2751 return false; 2752 2753 return (skb->destructor == sock_wfree || 2754 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2755 } 2756 2757 /* This helper is used by netem, as it can hold packets in its 2758 * delay queue. We want to allow the owner socket to send more 2759 * packets, as if they were already TX completed by a typical driver. 2760 * But we also want to keep skb->sk set because some packet schedulers 2761 * rely on it (sch_fq for example). 2762 */ 2763 void skb_orphan_partial(struct sk_buff *skb) 2764 { 2765 if (skb_is_tcp_pure_ack(skb)) 2766 return; 2767 2768 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2769 return; 2770 2771 skb_orphan(skb); 2772 } 2773 EXPORT_SYMBOL(skb_orphan_partial); 2774 2775 /* 2776 * Read buffer destructor automatically called from kfree_skb. 
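 *
 * Typically installed by skb_set_owner_r(), which charges skb->truesize
 * to sk_rmem_alloc and to the socket's forward allocation; both charges
 * are reversed here.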
2777 */ 2778 void sock_rfree(struct sk_buff *skb) 2779 { 2780 struct sock *sk = skb->sk; 2781 unsigned int len = skb->truesize; 2782 2783 atomic_sub(len, &sk->sk_rmem_alloc); 2784 sk_mem_uncharge(sk, len); 2785 } 2786 EXPORT_SYMBOL(sock_rfree); 2787 2788 /* 2789 * Buffer destructor for skbs that are not used directly in read or write 2790 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2791 */ 2792 void sock_efree(struct sk_buff *skb) 2793 { 2794 sock_put(skb->sk); 2795 } 2796 EXPORT_SYMBOL(sock_efree); 2797 2798 /* Buffer destructor for prefetch/receive path where reference count may 2799 * not be held, e.g. for listen sockets. 2800 */ 2801 #ifdef CONFIG_INET 2802 void sock_pfree(struct sk_buff *skb) 2803 { 2804 struct sock *sk = skb->sk; 2805 2806 if (!sk_is_refcounted(sk)) 2807 return; 2808 2809 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2810 inet_reqsk(sk)->rsk_listener = NULL; 2811 reqsk_free(inet_reqsk(sk)); 2812 return; 2813 } 2814 2815 sock_gen_put(sk); 2816 } 2817 EXPORT_SYMBOL(sock_pfree); 2818 #endif /* CONFIG_INET */ 2819 2820 /* 2821 * Allocate a skb from the socket's send buffer. 2822 */ 2823 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2824 gfp_t priority) 2825 { 2826 if (force || 2827 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2828 struct sk_buff *skb = alloc_skb(size, priority); 2829 2830 if (skb) { 2831 skb_set_owner_w(skb, sk); 2832 return skb; 2833 } 2834 } 2835 return NULL; 2836 } 2837 EXPORT_SYMBOL(sock_wmalloc); 2838 2839 static void sock_ofree(struct sk_buff *skb) 2840 { 2841 struct sock *sk = skb->sk; 2842 2843 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2844 } 2845 2846 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2847 gfp_t priority) 2848 { 2849 struct sk_buff *skb; 2850 2851 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2852 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2853 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2854 return NULL; 2855 2856 skb = alloc_skb(size, priority); 2857 if (!skb) 2858 return NULL; 2859 2860 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2861 skb->sk = sk; 2862 skb->destructor = sock_ofree; 2863 return skb; 2864 } 2865 2866 /* 2867 * Allocate a memory block from the socket's option memory buffer. 2868 */ 2869 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2870 { 2871 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2872 2873 if ((unsigned int)size <= optmem_max && 2874 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2875 void *mem; 2876 /* First do the add, to avoid the race if kmalloc 2877 * might sleep. 2878 */ 2879 atomic_add(size, &sk->sk_omem_alloc); 2880 mem = kmalloc(size, priority); 2881 if (mem) 2882 return mem; 2883 atomic_sub(size, &sk->sk_omem_alloc); 2884 } 2885 return NULL; 2886 } 2887 EXPORT_SYMBOL(sock_kmalloc); 2888 2889 /* 2890 * Duplicate the input "src" memory block using the socket's 2891 * option memory buffer. 2892 */ 2893 void *sock_kmemdup(struct sock *sk, const void *src, 2894 int size, gfp_t priority) 2895 { 2896 void *mem; 2897 2898 mem = sock_kmalloc(sk, size, priority); 2899 if (mem) 2900 memcpy(mem, src, size); 2901 return mem; 2902 } 2903 EXPORT_SYMBOL(sock_kmemdup); 2904 2905 /* Free an option memory block. Note, we actually want the inline 2906 * here as this allows gcc to detect the nullify and fold away the 2907 * condition entirely. 
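 *
 * A minimal usage sketch for the public pair (illustrative only; "len"
 * stands for whatever option-sized buffer a caller needs):
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);
 *
 * sock_kzfree_s() is the variant to use when the buffer held key
 * material or other sensitive data.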
2908 */ 2909 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2910 const bool nullify) 2911 { 2912 if (WARN_ON_ONCE(!mem)) 2913 return; 2914 if (nullify) 2915 kfree_sensitive(mem); 2916 else 2917 kfree(mem); 2918 atomic_sub(size, &sk->sk_omem_alloc); 2919 } 2920 2921 void sock_kfree_s(struct sock *sk, void *mem, int size) 2922 { 2923 __sock_kfree_s(sk, mem, size, false); 2924 } 2925 EXPORT_SYMBOL(sock_kfree_s); 2926 2927 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2928 { 2929 __sock_kfree_s(sk, mem, size, true); 2930 } 2931 EXPORT_SYMBOL(sock_kzfree_s); 2932 2933 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2934 I think, these locks should be removed for datagram sockets. 2935 */ 2936 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2937 { 2938 DEFINE_WAIT(wait); 2939 2940 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2941 for (;;) { 2942 if (!timeo) 2943 break; 2944 if (signal_pending(current)) 2945 break; 2946 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2947 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2948 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2949 break; 2950 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2951 break; 2952 if (READ_ONCE(sk->sk_err)) 2953 break; 2954 timeo = schedule_timeout(timeo); 2955 } 2956 finish_wait(sk_sleep(sk), &wait); 2957 return timeo; 2958 } 2959 2960 2961 /* 2962 * Generic send/receive buffer handlers 2963 */ 2964 2965 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2966 unsigned long data_len, int noblock, 2967 int *errcode, int max_page_order) 2968 { 2969 struct sk_buff *skb; 2970 long timeo; 2971 int err; 2972 2973 timeo = sock_sndtimeo(sk, noblock); 2974 for (;;) { 2975 err = sock_error(sk); 2976 if (err != 0) 2977 goto failure; 2978 2979 err = -EPIPE; 2980 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2981 goto failure; 2982 2983 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2984 break; 2985 2986 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2987 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2988 err = -EAGAIN; 2989 if (!timeo) 2990 goto failure; 2991 if (signal_pending(current)) 2992 goto interrupted; 2993 timeo = sock_wait_for_wmem(sk, timeo); 2994 } 2995 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2996 errcode, sk->sk_allocation); 2997 if (skb) 2998 skb_set_owner_w(skb, sk); 2999 return skb; 3000 3001 interrupted: 3002 err = sock_intr_errno(timeo); 3003 failure: 3004 *errcode = err; 3005 return NULL; 3006 } 3007 EXPORT_SYMBOL(sock_alloc_send_pskb); 3008 3009 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 3010 struct sockcm_cookie *sockc) 3011 { 3012 u32 tsflags; 3013 3014 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 3015 3016 switch (cmsg->cmsg_type) { 3017 case SO_MARK: 3018 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 3019 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3020 return -EPERM; 3021 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3022 return -EINVAL; 3023 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 3024 break; 3025 case SO_TIMESTAMPING_OLD: 3026 case SO_TIMESTAMPING_NEW: 3027 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3028 return -EINVAL; 3029 3030 tsflags = *(u32 *)CMSG_DATA(cmsg); 3031 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 3032 return -EINVAL; 3033 3034 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 3035 sockc->tsflags |= tsflags; 3036 break; 3037 case SCM_TXTIME: 3038 if (!sock_flag(sk, SOCK_TXTIME)) 3039 return -EINVAL; 3040 if 
(cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 3041 return -EINVAL; 3042 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 3043 break; 3044 case SCM_TS_OPT_ID: 3045 if (sk_is_tcp(sk)) 3046 return -EINVAL; 3047 tsflags = READ_ONCE(sk->sk_tsflags); 3048 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 3049 return -EINVAL; 3050 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3051 return -EINVAL; 3052 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 3053 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 3054 break; 3055 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 3056 case SCM_RIGHTS: 3057 case SCM_CREDENTIALS: 3058 break; 3059 case SO_PRIORITY: 3060 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3061 return -EINVAL; 3062 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) 3063 return -EPERM; 3064 sockc->priority = *(u32 *)CMSG_DATA(cmsg); 3065 break; 3066 case SCM_DEVMEM_DMABUF: 3067 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3068 return -EINVAL; 3069 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); 3070 break; 3071 default: 3072 return -EINVAL; 3073 } 3074 return 0; 3075 } 3076 EXPORT_SYMBOL(__sock_cmsg_send); 3077 3078 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 3079 struct sockcm_cookie *sockc) 3080 { 3081 struct cmsghdr *cmsg; 3082 int ret; 3083 3084 for_each_cmsghdr(cmsg, msg) { 3085 if (!CMSG_OK(msg, cmsg)) 3086 return -EINVAL; 3087 if (cmsg->cmsg_level != SOL_SOCKET) 3088 continue; 3089 ret = __sock_cmsg_send(sk, cmsg, sockc); 3090 if (ret) 3091 return ret; 3092 } 3093 return 0; 3094 } 3095 EXPORT_SYMBOL(sock_cmsg_send); 3096 3097 static void sk_enter_memory_pressure(struct sock *sk) 3098 { 3099 if (!sk->sk_prot->enter_memory_pressure) 3100 return; 3101 3102 sk->sk_prot->enter_memory_pressure(sk); 3103 } 3104 3105 static void sk_leave_memory_pressure(struct sock *sk) 3106 { 3107 if (sk->sk_prot->leave_memory_pressure) { 3108 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 3109 tcp_leave_memory_pressure, sk); 3110 } else { 3111 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 3112 3113 if (memory_pressure && READ_ONCE(*memory_pressure)) 3114 WRITE_ONCE(*memory_pressure, 0); 3115 } 3116 } 3117 3118 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 3119 3120 /** 3121 * skb_page_frag_refill - check that a page_frag contains enough room 3122 * @sz: minimum size of the fragment we want to get 3123 * @pfrag: pointer to page_frag 3124 * @gfp: priority for memory allocation 3125 * 3126 * Note: While this allocator tries to use high order pages, there is 3127 * no guarantee that allocations succeed. Therefore, @sz MUST be 3128 * less or equal than PAGE_SIZE. 
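 *
 * On success at least @sz bytes are available at pfrag->offset inside
 * pfrag->page, and the caller advances pfrag->offset by however much it
 * actually consumed, e.g. (sketch only; "copy" and "data" are
 * placeholders):
 *
 *	if (!skb_page_frag_refill(copy, pfrag, GFP_KERNEL))
 *		return -ENOMEM;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;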
3129 */ 3130 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 3131 { 3132 if (pfrag->page) { 3133 if (page_ref_count(pfrag->page) == 1) { 3134 pfrag->offset = 0; 3135 return true; 3136 } 3137 if (pfrag->offset + sz <= pfrag->size) 3138 return true; 3139 put_page(pfrag->page); 3140 } 3141 3142 pfrag->offset = 0; 3143 if (SKB_FRAG_PAGE_ORDER && 3144 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 3145 /* Avoid direct reclaim but allow kswapd to wake */ 3146 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 3147 __GFP_COMP | __GFP_NOWARN | 3148 __GFP_NORETRY, 3149 SKB_FRAG_PAGE_ORDER); 3150 if (likely(pfrag->page)) { 3151 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 3152 return true; 3153 } 3154 } 3155 pfrag->page = alloc_page(gfp); 3156 if (likely(pfrag->page)) { 3157 pfrag->size = PAGE_SIZE; 3158 return true; 3159 } 3160 return false; 3161 } 3162 EXPORT_SYMBOL(skb_page_frag_refill); 3163 3164 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 3165 { 3166 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3167 return true; 3168 3169 if (!sk->sk_bypass_prot_mem) 3170 sk_enter_memory_pressure(sk); 3171 3172 sk_stream_moderate_sndbuf(sk); 3173 3174 return false; 3175 } 3176 EXPORT_SYMBOL(sk_page_frag_refill); 3177 3178 void __lock_sock(struct sock *sk) 3179 __releases(&sk->sk_lock.slock) 3180 __acquires(&sk->sk_lock.slock) 3181 { 3182 DEFINE_WAIT(wait); 3183 3184 for (;;) { 3185 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 3186 TASK_UNINTERRUPTIBLE); 3187 spin_unlock_bh(&sk->sk_lock.slock); 3188 schedule(); 3189 spin_lock_bh(&sk->sk_lock.slock); 3190 if (!sock_owned_by_user(sk)) 3191 break; 3192 } 3193 finish_wait(&sk->sk_lock.wq, &wait); 3194 } 3195 3196 void __release_sock(struct sock *sk) 3197 __releases(&sk->sk_lock.slock) 3198 __acquires(&sk->sk_lock.slock) 3199 { 3200 struct sk_buff *skb, *next; 3201 int nb = 0; 3202 3203 while ((skb = sk->sk_backlog.head) != NULL) { 3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 3205 3206 spin_unlock_bh(&sk->sk_lock.slock); 3207 3208 while (1) { 3209 next = skb->next; 3210 prefetch(next); 3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3212 skb_mark_not_on_list(skb); 3213 sk_backlog_rcv(sk, skb); 3214 3215 skb = next; 3216 if (!skb) 3217 break; 3218 3219 if (!(++nb & 15)) 3220 cond_resched(); 3221 } 3222 3223 spin_lock_bh(&sk->sk_lock.slock); 3224 } 3225 3226 /* 3227 * Doing the zeroing here guarantee we can not loop forever 3228 * while a wild producer attempts to flood us. 3229 */ 3230 sk->sk_backlog.len = 0; 3231 } 3232 3233 void __sk_flush_backlog(struct sock *sk) 3234 { 3235 spin_lock_bh(&sk->sk_lock.slock); 3236 __release_sock(sk); 3237 3238 if (sk->sk_prot->release_cb) 3239 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3240 tcp_release_cb, sk); 3241 3242 spin_unlock_bh(&sk->sk_lock.slock); 3243 } 3244 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3245 3246 /** 3247 * sk_wait_data - wait for data to arrive at sk_receive_queue 3248 * @sk: sock to wait on 3249 * @timeo: for how long 3250 * @skb: last skb seen on sk_receive_queue 3251 * 3252 * Now socket state including sk->sk_err is changed only under lock, 3253 * hence we may omit checks after joining wait queue. 3254 * We check receive queue before schedule() only as optimization; 3255 * it is very likely that release_sock() added new data. 
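 *
 * The caller must hold the socket lock; it is dropped while sleeping
 * and re-acquired before returning.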
3256 */ 3257 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3258 { 3259 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3260 int rc; 3261 3262 add_wait_queue(sk_sleep(sk), &wait); 3263 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3264 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3265 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3266 remove_wait_queue(sk_sleep(sk), &wait); 3267 return rc; 3268 } 3269 EXPORT_SYMBOL(sk_wait_data); 3270 3271 /** 3272 * __sk_mem_raise_allocated - increase memory_allocated 3273 * @sk: socket 3274 * @size: memory size to allocate 3275 * @amt: pages to allocate 3276 * @kind: allocation type 3277 * 3278 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. 3279 * 3280 * Unlike the globally shared limits among the sockets under same protocol, 3281 * consuming the budget of a memcg won't have direct effect on other ones. 3282 * So be optimistic about memcg's tolerance, and leave the callers to decide 3283 * whether or not to raise allocated through sk_under_memory_pressure() or 3284 * its variants. 3285 */ 3286 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3287 { 3288 bool memcg_enabled = false, charged = false; 3289 struct proto *prot = sk->sk_prot; 3290 long allocated = 0; 3291 3292 if (!sk->sk_bypass_prot_mem) { 3293 sk_memory_allocated_add(sk, amt); 3294 allocated = sk_memory_allocated(sk); 3295 } 3296 3297 if (mem_cgroup_sk_enabled(sk)) { 3298 memcg_enabled = true; 3299 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); 3300 if (!charged) 3301 goto suppress_allocation; 3302 } 3303 3304 if (!allocated) 3305 return 1; 3306 3307 /* Under limit. */ 3308 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3309 sk_leave_memory_pressure(sk); 3310 return 1; 3311 } 3312 3313 /* Under pressure. */ 3314 if (allocated > sk_prot_mem_limits(sk, 1)) 3315 sk_enter_memory_pressure(sk); 3316 3317 /* Over hard limit. */ 3318 if (allocated > sk_prot_mem_limits(sk, 2)) 3319 goto suppress_allocation; 3320 3321 /* Guarantee minimum buffer size under pressure (either global 3322 * or memcg) to make sure features described in RFC 7323 (TCP 3323 * Extensions for High Performance) work properly. 3324 * 3325 * This rule does NOT stand when exceeds global or memcg's hard 3326 * limit, or else a DoS attack can be taken place by spawning 3327 * lots of sockets whose usage are under minimum buffer size. 3328 */ 3329 if (kind == SK_MEM_RECV) { 3330 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3331 return 1; 3332 3333 } else { /* SK_MEM_SEND */ 3334 int wmem0 = sk_get_wmem0(sk, prot); 3335 3336 if (sk->sk_type == SOCK_STREAM) { 3337 if (sk->sk_wmem_queued < wmem0) 3338 return 1; 3339 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3340 return 1; 3341 } 3342 } 3343 3344 if (sk_has_memory_pressure(sk)) { 3345 u64 alloc; 3346 3347 /* The following 'average' heuristic is within the 3348 * scope of global accounting, so it only makes 3349 * sense for global memory pressure. 3350 */ 3351 if (!sk_under_global_memory_pressure(sk)) 3352 return 1; 3353 3354 /* Try to be fair among all the sockets under global 3355 * pressure by allowing the ones that below average 3356 * usage to raise. 
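 *
 * The check below amounts to
 *
 *	pages_used_by_this_socket < hard_limit / nr_sockets
 *
 * i.e. the raise is still allowed while this socket stays under its
 * per-socket share of the hard (level 2) limit.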
3357 */ 3358 alloc = sk_sockets_allocated_read_positive(sk); 3359 if (sk_prot_mem_limits(sk, 2) > alloc * 3360 sk_mem_pages(sk->sk_wmem_queued + 3361 atomic_read(&sk->sk_rmem_alloc) + 3362 sk->sk_forward_alloc)) 3363 return 1; 3364 } 3365 3366 suppress_allocation: 3367 3368 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3369 sk_stream_moderate_sndbuf(sk); 3370 3371 /* Fail only if socket is _under_ its sndbuf. 3372 * In this case we cannot block, so that we have to fail. 3373 */ 3374 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3375 /* Force charge with __GFP_NOFAIL */ 3376 if (memcg_enabled && !charged) 3377 mem_cgroup_sk_charge(sk, amt, 3378 gfp_memcg_charge() | __GFP_NOFAIL); 3379 return 1; 3380 } 3381 } 3382 3383 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3384 3385 if (allocated) 3386 sk_memory_allocated_sub(sk, amt); 3387 3388 if (charged) 3389 mem_cgroup_sk_uncharge(sk, amt); 3390 3391 return 0; 3392 } 3393 3394 /** 3395 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3396 * @sk: socket 3397 * @size: memory size to allocate 3398 * @kind: allocation type 3399 * 3400 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3401 * rmem allocation. This function assumes that protocols which have 3402 * memory_pressure use sk_wmem_queued as write buffer accounting. 3403 */ 3404 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3405 { 3406 int ret, amt = sk_mem_pages(size); 3407 3408 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3409 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3410 if (!ret) 3411 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3412 return ret; 3413 } 3414 EXPORT_SYMBOL(__sk_mem_schedule); 3415 3416 /** 3417 * __sk_mem_reduce_allocated - reclaim memory_allocated 3418 * @sk: socket 3419 * @amount: number of quanta 3420 * 3421 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3422 */ 3423 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3424 { 3425 if (mem_cgroup_sk_enabled(sk)) 3426 mem_cgroup_sk_uncharge(sk, amount); 3427 3428 if (sk->sk_bypass_prot_mem) 3429 return; 3430 3431 sk_memory_allocated_sub(sk, amount); 3432 3433 if (sk_under_global_memory_pressure(sk) && 3434 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3435 sk_leave_memory_pressure(sk); 3436 } 3437 3438 /** 3439 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3440 * @sk: socket 3441 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3442 */ 3443 void __sk_mem_reclaim(struct sock *sk, int amount) 3444 { 3445 amount >>= PAGE_SHIFT; 3446 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3447 __sk_mem_reduce_allocated(sk, amount); 3448 } 3449 EXPORT_SYMBOL(__sk_mem_reclaim); 3450 3451 int sk_set_peek_off(struct sock *sk, int val) 3452 { 3453 WRITE_ONCE(sk->sk_peek_off, val); 3454 return 0; 3455 } 3456 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3457 3458 /* 3459 * Set of default routines for initialising struct proto_ops when 3460 * the protocol does not support a particular function. In certain 3461 * cases where it makes no sense for a protocol to have a "do nothing" 3462 * function, some default processing is provided. 
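 *
 * A protocol family typically plugs these stubs straight into its
 * struct proto_ops. Sketch only, for a hypothetical PF_FOO family:
 *
 *	static const struct proto_ops foo_proto_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		...
 *	};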
3463 */ 3464 3465 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) 3466 { 3467 return -EOPNOTSUPP; 3468 } 3469 EXPORT_SYMBOL(sock_no_bind); 3470 3471 int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, 3472 int len, int flags) 3473 { 3474 return -EOPNOTSUPP; 3475 } 3476 EXPORT_SYMBOL(sock_no_connect); 3477 3478 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3479 { 3480 return -EOPNOTSUPP; 3481 } 3482 EXPORT_SYMBOL(sock_no_socketpair); 3483 3484 int sock_no_accept(struct socket *sock, struct socket *newsock, 3485 struct proto_accept_arg *arg) 3486 { 3487 return -EOPNOTSUPP; 3488 } 3489 EXPORT_SYMBOL(sock_no_accept); 3490 3491 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3492 int peer) 3493 { 3494 return -EOPNOTSUPP; 3495 } 3496 EXPORT_SYMBOL(sock_no_getname); 3497 3498 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3499 { 3500 return -EOPNOTSUPP; 3501 } 3502 EXPORT_SYMBOL(sock_no_ioctl); 3503 3504 int sock_no_listen(struct socket *sock, int backlog) 3505 { 3506 return -EOPNOTSUPP; 3507 } 3508 EXPORT_SYMBOL(sock_no_listen); 3509 3510 int sock_no_shutdown(struct socket *sock, int how) 3511 { 3512 return -EOPNOTSUPP; 3513 } 3514 EXPORT_SYMBOL(sock_no_shutdown); 3515 3516 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3517 { 3518 return -EOPNOTSUPP; 3519 } 3520 EXPORT_SYMBOL(sock_no_sendmsg); 3521 3522 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3523 { 3524 return -EOPNOTSUPP; 3525 } 3526 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3527 3528 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3529 int flags) 3530 { 3531 return -EOPNOTSUPP; 3532 } 3533 EXPORT_SYMBOL(sock_no_recvmsg); 3534 3535 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3536 { 3537 /* Mirror missing mmap method error code */ 3538 return -ENODEV; 3539 } 3540 EXPORT_SYMBOL(sock_no_mmap); 3541 3542 /* 3543 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3544 * various sock-based usage counts. 
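 * Currently that means refreshing the cgroup classid and netprio index
 * cached in the socket's sk_cgrp_data.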
3545 */ 3546 void __receive_sock(struct file *file) 3547 { 3548 struct socket *sock; 3549 3550 sock = sock_from_file(file); 3551 if (sock) { 3552 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3553 sock_update_classid(&sock->sk->sk_cgrp_data); 3554 } 3555 } 3556 3557 /* 3558 * Default Socket Callbacks 3559 */ 3560 3561 static void sock_def_wakeup(struct sock *sk) 3562 { 3563 struct socket_wq *wq; 3564 3565 rcu_read_lock(); 3566 wq = rcu_dereference(sk->sk_wq); 3567 if (skwq_has_sleeper(wq)) 3568 wake_up_interruptible_all(&wq->wait); 3569 rcu_read_unlock(); 3570 } 3571 3572 static void sock_def_error_report(struct sock *sk) 3573 { 3574 struct socket_wq *wq; 3575 3576 rcu_read_lock(); 3577 wq = rcu_dereference(sk->sk_wq); 3578 if (skwq_has_sleeper(wq)) 3579 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3580 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3581 rcu_read_unlock(); 3582 } 3583 3584 void sock_def_readable(struct sock *sk) 3585 { 3586 struct socket_wq *wq; 3587 3588 trace_sk_data_ready(sk); 3589 3590 rcu_read_lock(); 3591 wq = rcu_dereference(sk->sk_wq); 3592 if (skwq_has_sleeper(wq)) 3593 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3594 EPOLLRDNORM | EPOLLRDBAND); 3595 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3596 rcu_read_unlock(); 3597 } 3598 3599 static void sock_def_write_space(struct sock *sk) 3600 { 3601 struct socket_wq *wq; 3602 3603 rcu_read_lock(); 3604 3605 /* Do not wake up a writer until he can make "significant" 3606 * progress. --DaveM 3607 */ 3608 if (sock_writeable(sk)) { 3609 wq = rcu_dereference(sk->sk_wq); 3610 if (skwq_has_sleeper(wq)) 3611 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3612 EPOLLWRNORM | EPOLLWRBAND); 3613 3614 /* Should agree with poll, otherwise some programs break */ 3615 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3616 } 3617 3618 rcu_read_unlock(); 3619 } 3620 3621 /* An optimised version of sock_def_write_space(), should only be called 3622 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3623 * ->sk_wmem_alloc. 3624 */ 3625 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) 3626 { 3627 /* Do not wake up a writer until he can make "significant" 3628 * progress. 
--DaveM 3629 */ 3630 if (__sock_writeable(sk, wmem_alloc)) { 3631 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3632 3633 /* rely on refcount_sub from sock_wfree() */ 3634 smp_mb__after_atomic(); 3635 if (wq && waitqueue_active(&wq->wait)) 3636 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3637 EPOLLWRNORM | EPOLLWRBAND); 3638 3639 /* Should agree with poll, otherwise some programs break */ 3640 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3641 } 3642 } 3643 3644 static void sock_def_destruct(struct sock *sk) 3645 { 3646 } 3647 3648 void sk_send_sigurg(struct sock *sk) 3649 { 3650 if (sk->sk_socket && sk->sk_socket->file) 3651 if (send_sigurg(sk->sk_socket->file)) 3652 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3653 } 3654 EXPORT_SYMBOL(sk_send_sigurg); 3655 3656 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3657 unsigned long expires) 3658 { 3659 if (!mod_timer(timer, expires)) 3660 sock_hold(sk); 3661 } 3662 EXPORT_SYMBOL(sk_reset_timer); 3663 3664 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3665 { 3666 if (timer_delete(timer)) 3667 __sock_put(sk); 3668 } 3669 EXPORT_SYMBOL(sk_stop_timer); 3670 3671 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3672 { 3673 if (timer_delete_sync(timer)) 3674 __sock_put(sk); 3675 } 3676 EXPORT_SYMBOL(sk_stop_timer_sync); 3677 3678 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3679 { 3680 sk_init_common(sk); 3681 sk->sk_send_head = NULL; 3682 3683 timer_setup(&sk->sk_timer, NULL, 0); 3684 3685 sk->sk_allocation = GFP_KERNEL; 3686 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3687 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3688 sk->sk_state = TCP_CLOSE; 3689 sk->sk_use_task_frag = true; 3690 sk_set_socket(sk, sock); 3691 3692 sock_set_flag(sk, SOCK_ZAPPED); 3693 3694 if (sock) { 3695 sk->sk_type = sock->type; 3696 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3697 sock->sk = sk; 3698 } else { 3699 RCU_INIT_POINTER(sk->sk_wq, NULL); 3700 } 3701 sk->sk_uid = uid; 3702 3703 sk->sk_state_change = sock_def_wakeup; 3704 sk->sk_data_ready = sock_def_readable; 3705 sk->sk_write_space = sock_def_write_space; 3706 sk->sk_error_report = sock_def_error_report; 3707 sk->sk_destruct = sock_def_destruct; 3708 3709 sk->sk_frag.page = NULL; 3710 sk->sk_frag.offset = 0; 3711 sk->sk_peek_off = -1; 3712 3713 sk->sk_peer_pid = NULL; 3714 sk->sk_peer_cred = NULL; 3715 spin_lock_init(&sk->sk_peer_lock); 3716 3717 sk->sk_write_pending = 0; 3718 sk->sk_rcvlowat = 1; 3719 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3720 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3721 3722 sk->sk_stamp = SK_DEFAULT_STAMP; 3723 #if BITS_PER_LONG==32 3724 seqlock_init(&sk->sk_stamp_seq); 3725 #endif 3726 atomic_set(&sk->sk_zckey, 0); 3727 3728 #ifdef CONFIG_NET_RX_BUSY_POLL 3729 sk->sk_napi_id = 0; 3730 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3731 #endif 3732 3733 sk->sk_max_pacing_rate = ~0UL; 3734 sk->sk_pacing_rate = ~0UL; 3735 WRITE_ONCE(sk->sk_pacing_shift, 10); 3736 sk->sk_incoming_cpu = -1; 3737 3738 sk_rx_queue_clear(sk); 3739 /* 3740 * Before updating sk_refcnt, we must commit prior changes to memory 3741 * (Documentation/RCU/rculist_nulls.rst for details) 3742 */ 3743 smp_wmb(); 3744 refcount_set(&sk->sk_refcnt, 1); 3745 sk_drops_reset(sk); 3746 } 3747 EXPORT_SYMBOL(sock_init_data_uid); 3748 3749 void sock_init_data(struct socket *sock, struct sock *sk) 3750 { 3751 kuid_t uid = sock ? 
3752 SOCK_INODE(sock)->i_uid : 3753 make_kuid(sock_net(sk)->user_ns, 0); 3754 3755 sock_init_data_uid(sock, sk, uid); 3756 } 3757 EXPORT_SYMBOL(sock_init_data); 3758 3759 void lock_sock_nested(struct sock *sk, int subclass) 3760 { 3761 /* The sk_lock has mutex_lock() semantics here. */ 3762 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3763 3764 might_sleep(); 3765 spin_lock_bh(&sk->sk_lock.slock); 3766 if (sock_owned_by_user_nocheck(sk)) 3767 __lock_sock(sk); 3768 sk->sk_lock.owned = 1; 3769 spin_unlock_bh(&sk->sk_lock.slock); 3770 } 3771 EXPORT_SYMBOL(lock_sock_nested); 3772 3773 void release_sock(struct sock *sk) 3774 { 3775 spin_lock_bh(&sk->sk_lock.slock); 3776 if (sk->sk_backlog.tail) 3777 __release_sock(sk); 3778 3779 if (sk->sk_prot->release_cb) 3780 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3781 tcp_release_cb, sk); 3782 3783 sock_release_ownership(sk); 3784 if (waitqueue_active(&sk->sk_lock.wq)) 3785 wake_up(&sk->sk_lock.wq); 3786 spin_unlock_bh(&sk->sk_lock.slock); 3787 } 3788 EXPORT_SYMBOL(release_sock); 3789 3790 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3791 { 3792 might_sleep(); 3793 spin_lock_bh(&sk->sk_lock.slock); 3794 3795 if (!sock_owned_by_user_nocheck(sk)) { 3796 /* 3797 * Fast path return with bottom halves disabled and 3798 * sock::sk_lock.slock held. 3799 * 3800 * The 'mutex' is not contended and holding 3801 * sock::sk_lock.slock prevents all other lockers to 3802 * proceed so the corresponding unlock_sock_fast() can 3803 * avoid the slow path of release_sock() completely and 3804 * just release slock. 3805 * 3806 * From a semantical POV this is equivalent to 'acquiring' 3807 * the 'mutex', hence the corresponding lockdep 3808 * mutex_release() has to happen in the fast path of 3809 * unlock_sock_fast(). 
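 *
 * Callers of lock_sock_fast() must pass the value returned here to
 * unlock_sock_fast() so the matching fast or slow unlock path is taken.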
3810 */ 3811 return false; 3812 } 3813 3814 __lock_sock(sk); 3815 sk->sk_lock.owned = 1; 3816 __acquire(&sk->sk_lock.slock); 3817 spin_unlock_bh(&sk->sk_lock.slock); 3818 return true; 3819 } 3820 EXPORT_SYMBOL(__lock_sock_fast); 3821 3822 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3823 bool timeval, bool time32) 3824 { 3825 struct sock *sk = sock->sk; 3826 struct timespec64 ts; 3827 3828 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3829 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3830 if (ts.tv_sec == -1) 3831 return -ENOENT; 3832 if (ts.tv_sec == 0) { 3833 ktime_t kt = ktime_get_real(); 3834 sock_write_timestamp(sk, kt); 3835 ts = ktime_to_timespec64(kt); 3836 } 3837 3838 if (timeval) 3839 ts.tv_nsec /= 1000; 3840 3841 #ifdef CONFIG_COMPAT_32BIT_TIME 3842 if (time32) 3843 return put_old_timespec32(&ts, userstamp); 3844 #endif 3845 #ifdef CONFIG_SPARC64 3846 /* beware of padding in sparc64 timeval */ 3847 if (timeval && !in_compat_syscall()) { 3848 struct __kernel_old_timeval __user tv = { 3849 .tv_sec = ts.tv_sec, 3850 .tv_usec = ts.tv_nsec, 3851 }; 3852 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3853 return -EFAULT; 3854 return 0; 3855 } 3856 #endif 3857 return put_timespec64(&ts, userstamp); 3858 } 3859 EXPORT_SYMBOL(sock_gettstamp); 3860 3861 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3862 { 3863 if (!sock_flag(sk, flag)) { 3864 unsigned long previous_flags = sk->sk_flags; 3865 3866 sock_set_flag(sk, flag); 3867 /* 3868 * we just set one of the two flags which require net 3869 * time stamping, but time stamping might have been on 3870 * already because of the other one 3871 */ 3872 if (sock_needs_netstamp(sk) && 3873 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3874 net_enable_timestamp(); 3875 } 3876 } 3877 3878 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3879 int level, int type) 3880 { 3881 struct sock_exterr_skb *serr; 3882 struct sk_buff *skb; 3883 int copied, err; 3884 3885 err = -EAGAIN; 3886 skb = sock_dequeue_err_skb(sk); 3887 if (skb == NULL) 3888 goto out; 3889 3890 copied = skb->len; 3891 if (copied > len) { 3892 msg->msg_flags |= MSG_TRUNC; 3893 copied = len; 3894 } 3895 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3896 if (err) 3897 goto out_free_skb; 3898 3899 sock_recv_timestamp(msg, sk, skb); 3900 3901 serr = SKB_EXT_ERR(skb); 3902 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3903 3904 msg->msg_flags |= MSG_ERRQUEUE; 3905 err = copied; 3906 3907 out_free_skb: 3908 kfree_skb(skb); 3909 out: 3910 return err; 3911 } 3912 EXPORT_SYMBOL(sock_recv_errqueue); 3913 3914 /* 3915 * Get a socket option on an socket. 3916 * 3917 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3918 * asynchronous errors should be reported by getsockopt. We assume 3919 * this means if you specify SO_ERROR (otherwise what is the point of it). 3920 */ 3921 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3922 char __user *optval, int __user *optlen) 3923 { 3924 struct sock *sk = sock->sk; 3925 3926 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ 3927 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3928 } 3929 EXPORT_SYMBOL(sock_common_getsockopt); 3930 3931 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3932 int flags) 3933 { 3934 struct sock *sk = sock->sk; 3935 int addr_len = 0; 3936 int err; 3937 3938 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3939 if (err >= 0) 3940 msg->msg_namelen = addr_len; 3941 return err; 3942 } 3943 EXPORT_SYMBOL(sock_common_recvmsg); 3944 3945 /* 3946 * Set socket options on an inet socket. 3947 */ 3948 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3949 sockptr_t optval, unsigned int optlen) 3950 { 3951 struct sock *sk = sock->sk; 3952 3953 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3954 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3955 } 3956 EXPORT_SYMBOL(sock_common_setsockopt); 3957 3958 void sk_common_release(struct sock *sk) 3959 { 3960 if (sk->sk_prot->destroy) 3961 sk->sk_prot->destroy(sk); 3962 3963 /* 3964 * Observation: when sk_common_release is called, processes have 3965 * no access to socket. But net still has. 3966 * Step one, detach it from networking: 3967 * 3968 * A. Remove from hash tables. 3969 */ 3970 3971 sk->sk_prot->unhash(sk); 3972 3973 /* 3974 * In this point socket cannot receive new packets, but it is possible 3975 * that some packets are in flight because some CPU runs receiver and 3976 * did hash table lookup before we unhashed socket. They will achieve 3977 * receive queue and will be purged by socket destructor. 3978 * 3979 * Also we still have packets pending on receive queue and probably, 3980 * our own packets waiting in device queues. sock_destroy will drain 3981 * receive queue, but transmitted packets will delay socket destruction 3982 * until the last reference will be released. 3983 */ 3984 3985 sock_orphan(sk); 3986 3987 xfrm_sk_free_policy(sk); 3988 3989 sock_put(sk); 3990 } 3991 EXPORT_SYMBOL(sk_common_release); 3992 3993 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3994 { 3995 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3996 3997 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3998 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3999 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 4000 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 4001 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 4002 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 4003 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 4004 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 4005 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); 4006 } 4007 4008 #ifdef CONFIG_PROC_FS 4009 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 4010 4011 int sock_prot_inuse_get(struct net *net, struct proto *prot) 4012 { 4013 int cpu, idx = prot->inuse_idx; 4014 int res = 0; 4015 4016 for_each_possible_cpu(cpu) 4017 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 4018 4019 return res >= 0 ? 
res : 0; 4020 } 4021 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4022 4023 int sock_inuse_get(struct net *net) 4024 { 4025 int cpu, res = 0; 4026 4027 for_each_possible_cpu(cpu) 4028 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4029 4030 return res; 4031 } 4032 4033 EXPORT_SYMBOL_GPL(sock_inuse_get); 4034 4035 static int __net_init sock_inuse_init_net(struct net *net) 4036 { 4037 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4038 if (net->core.prot_inuse == NULL) 4039 return -ENOMEM; 4040 return 0; 4041 } 4042 4043 static void __net_exit sock_inuse_exit_net(struct net *net) 4044 { 4045 free_percpu(net->core.prot_inuse); 4046 } 4047 4048 static struct pernet_operations net_inuse_ops = { 4049 .init = sock_inuse_init_net, 4050 .exit = sock_inuse_exit_net, 4051 }; 4052 4053 static __init int net_inuse_init(void) 4054 { 4055 if (register_pernet_subsys(&net_inuse_ops)) 4056 panic("Cannot initialize net inuse counters"); 4057 4058 return 0; 4059 } 4060 4061 core_initcall(net_inuse_init); 4062 4063 static int assign_proto_idx(struct proto *prot) 4064 { 4065 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4066 4067 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4068 pr_err("PROTO_INUSE_NR exhausted\n"); 4069 return -ENOSPC; 4070 } 4071 4072 set_bit(prot->inuse_idx, proto_inuse_idx); 4073 return 0; 4074 } 4075 4076 static void release_proto_idx(struct proto *prot) 4077 { 4078 if (prot->inuse_idx != PROTO_INUSE_NR) 4079 clear_bit(prot->inuse_idx, proto_inuse_idx); 4080 } 4081 #else 4082 static inline int assign_proto_idx(struct proto *prot) 4083 { 4084 return 0; 4085 } 4086 4087 static inline void release_proto_idx(struct proto *prot) 4088 { 4089 } 4090 4091 #endif 4092 4093 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4094 { 4095 if (!twsk_prot) 4096 return; 4097 kfree(twsk_prot->twsk_slab_name); 4098 twsk_prot->twsk_slab_name = NULL; 4099 kmem_cache_destroy(twsk_prot->twsk_slab); 4100 twsk_prot->twsk_slab = NULL; 4101 } 4102 4103 static int tw_prot_init(const struct proto *prot) 4104 { 4105 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4106 4107 if (!twsk_prot) 4108 return 0; 4109 4110 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4111 prot->name); 4112 if (!twsk_prot->twsk_slab_name) 4113 return -ENOMEM; 4114 4115 twsk_prot->twsk_slab = 4116 kmem_cache_create(twsk_prot->twsk_slab_name, 4117 twsk_prot->twsk_obj_size, 0, 4118 SLAB_ACCOUNT | prot->slab_flags, 4119 NULL); 4120 if (!twsk_prot->twsk_slab) { 4121 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4122 prot->name); 4123 return -ENOMEM; 4124 } 4125 4126 return 0; 4127 } 4128 4129 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4130 { 4131 if (!rsk_prot) 4132 return; 4133 kfree(rsk_prot->slab_name); 4134 rsk_prot->slab_name = NULL; 4135 kmem_cache_destroy(rsk_prot->slab); 4136 rsk_prot->slab = NULL; 4137 } 4138 4139 static int req_prot_init(const struct proto *prot) 4140 { 4141 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4142 4143 if (!rsk_prot) 4144 return 0; 4145 4146 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4147 prot->name); 4148 if (!rsk_prot->slab_name) 4149 return -ENOMEM; 4150 4151 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4152 rsk_prot->obj_size, 0, 4153 SLAB_ACCOUNT | prot->slab_flags, 4154 NULL); 4155 4156 if (!rsk_prot->slab) { 4157 pr_crit("%s: Can't create request sock SLAB cache!\n", 4158 prot->name); 4159 return -ENOMEM; 4160 } 4161 return 0; 4162 } 4163 4164 int 
proto_register(struct proto *prot, int alloc_slab) 4165 { 4166 int ret = -ENOBUFS; 4167 4168 if (prot->memory_allocated && !prot->sysctl_mem) { 4169 pr_err("%s: missing sysctl_mem\n", prot->name); 4170 return -EINVAL; 4171 } 4172 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 4173 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 4174 return -EINVAL; 4175 } 4176 if (alloc_slab) { 4177 prot->slab = kmem_cache_create_usercopy(prot->name, 4178 prot->obj_size, 0, 4179 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 4180 prot->slab_flags, 4181 prot->useroffset, prot->usersize, 4182 NULL); 4183 4184 if (prot->slab == NULL) { 4185 pr_crit("%s: Can't create sock SLAB cache!\n", 4186 prot->name); 4187 goto out; 4188 } 4189 4190 if (req_prot_init(prot)) 4191 goto out_free_request_sock_slab; 4192 4193 if (tw_prot_init(prot)) 4194 goto out_free_timewait_sock_slab; 4195 } 4196 4197 mutex_lock(&proto_list_mutex); 4198 ret = assign_proto_idx(prot); 4199 if (ret) { 4200 mutex_unlock(&proto_list_mutex); 4201 goto out_free_timewait_sock_slab; 4202 } 4203 list_add(&prot->node, &proto_list); 4204 mutex_unlock(&proto_list_mutex); 4205 return ret; 4206 4207 out_free_timewait_sock_slab: 4208 if (alloc_slab) 4209 tw_prot_cleanup(prot->twsk_prot); 4210 out_free_request_sock_slab: 4211 if (alloc_slab) { 4212 req_prot_cleanup(prot->rsk_prot); 4213 4214 kmem_cache_destroy(prot->slab); 4215 prot->slab = NULL; 4216 } 4217 out: 4218 return ret; 4219 } 4220 EXPORT_SYMBOL(proto_register); 4221 4222 void proto_unregister(struct proto *prot) 4223 { 4224 mutex_lock(&proto_list_mutex); 4225 release_proto_idx(prot); 4226 list_del(&prot->node); 4227 mutex_unlock(&proto_list_mutex); 4228 4229 kmem_cache_destroy(prot->slab); 4230 prot->slab = NULL; 4231 4232 req_prot_cleanup(prot->rsk_prot); 4233 tw_prot_cleanup(prot->twsk_prot); 4234 } 4235 EXPORT_SYMBOL(proto_unregister); 4236 4237 int sock_load_diag_module(int family, int protocol) 4238 { 4239 if (!protocol) { 4240 if (!sock_is_registered(family)) 4241 return -ENOENT; 4242 4243 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 4244 NETLINK_SOCK_DIAG, family); 4245 } 4246 4247 #ifdef CONFIG_INET 4248 if (family == AF_INET && 4249 protocol != IPPROTO_RAW && 4250 protocol < MAX_INET_PROTOS && 4251 !rcu_access_pointer(inet_protos[protocol])) 4252 return -ENOENT; 4253 #endif 4254 4255 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 4256 NETLINK_SOCK_DIAG, family, protocol); 4257 } 4258 EXPORT_SYMBOL(sock_load_diag_module); 4259 4260 #ifdef CONFIG_PROC_FS 4261 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4262 __acquires(proto_list_mutex) 4263 { 4264 mutex_lock(&proto_list_mutex); 4265 return seq_list_start_head(&proto_list, *pos); 4266 } 4267 4268 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4269 { 4270 return seq_list_next(v, &proto_list, pos); 4271 } 4272 4273 static void proto_seq_stop(struct seq_file *seq, void *v) 4274 __releases(proto_list_mutex) 4275 { 4276 mutex_unlock(&proto_list_mutex); 4277 } 4278 4279 static char proto_method_implemented(const void *method) 4280 { 4281 return method == NULL ? 'n' : 'y'; 4282 } 4283 static long sock_prot_memory_allocated(struct proto *proto) 4284 { 4285 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 4286 } 4287 4288 static const char *sock_prot_memory_pressure(struct proto *proto) 4289 { 4290 return proto->memory_pressure != NULL ? 4291 proto_memory_pressure(proto) ? 
"yes" : "no" : "NI"; 4292 } 4293 4294 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4295 { 4296 4297 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4298 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4299 proto->name, 4300 proto->obj_size, 4301 sock_prot_inuse_get(seq_file_net(seq), proto), 4302 sock_prot_memory_allocated(proto), 4303 sock_prot_memory_pressure(proto), 4304 proto->max_header, 4305 proto->slab == NULL ? "no" : "yes", 4306 module_name(proto->owner), 4307 proto_method_implemented(proto->close), 4308 proto_method_implemented(proto->connect), 4309 proto_method_implemented(proto->disconnect), 4310 proto_method_implemented(proto->accept), 4311 proto_method_implemented(proto->ioctl), 4312 proto_method_implemented(proto->init), 4313 proto_method_implemented(proto->destroy), 4314 proto_method_implemented(proto->shutdown), 4315 proto_method_implemented(proto->setsockopt), 4316 proto_method_implemented(proto->getsockopt), 4317 proto_method_implemented(proto->sendmsg), 4318 proto_method_implemented(proto->recvmsg), 4319 proto_method_implemented(proto->bind), 4320 proto_method_implemented(proto->backlog_rcv), 4321 proto_method_implemented(proto->hash), 4322 proto_method_implemented(proto->unhash), 4323 proto_method_implemented(proto->get_port), 4324 proto_method_implemented(proto->enter_memory_pressure)); 4325 } 4326 4327 static int proto_seq_show(struct seq_file *seq, void *v) 4328 { 4329 if (v == &proto_list) 4330 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4331 "protocol", 4332 "size", 4333 "sockets", 4334 "memory", 4335 "press", 4336 "maxhdr", 4337 "slab", 4338 "module", 4339 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4340 else 4341 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4342 return 0; 4343 } 4344 4345 static const struct seq_operations proto_seq_ops = { 4346 .start = proto_seq_start, 4347 .next = proto_seq_next, 4348 .stop = proto_seq_stop, 4349 .show = proto_seq_show, 4350 }; 4351 4352 static __net_init int proto_init_net(struct net *net) 4353 { 4354 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4355 sizeof(struct seq_net_private))) 4356 return -ENOMEM; 4357 4358 return 0; 4359 } 4360 4361 static __net_exit void proto_exit_net(struct net *net) 4362 { 4363 remove_proc_entry("protocols", net->proc_net); 4364 } 4365 4366 4367 static __net_initdata struct pernet_operations proto_net_ops = { 4368 .init = proto_init_net, 4369 .exit = proto_exit_net, 4370 }; 4371 4372 static int __init proto_init(void) 4373 { 4374 return register_pernet_subsys(&proto_net_ops); 4375 } 4376 4377 subsys_initcall(proto_init); 4378 4379 #endif /* PROC_FS */ 4380 4381 #ifdef CONFIG_NET_RX_BUSY_POLL 4382 bool sk_busy_loop_end(void *p, unsigned long start_time) 4383 { 4384 struct sock *sk = p; 4385 4386 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4387 return true; 4388 4389 if (sk_is_udp(sk) && 4390 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4391 return true; 4392 4393 return sk_busy_loop_timeout(sk, start_time); 4394 } 4395 EXPORT_SYMBOL(sk_busy_loop_end); 4396 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4397 4398 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) 4399 { 4400 if (!sk->sk_prot->bind_add) 4401 return -EOPNOTSUPP; 4402 return sk->sk_prot->bind_add(sk, addr, addr_len); 4403 } 4404 EXPORT_SYMBOL(sock_bind_add); 4405 4406 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4407 int 
sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4408 void __user *arg, void *karg, size_t size) 4409 { 4410 int ret; 4411 4412 if (copy_from_user(karg, arg, size)) 4413 return -EFAULT; 4414 4415 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4416 if (ret) 4417 return ret; 4418 4419 if (copy_to_user(arg, karg, size)) 4420 return -EFAULT; 4421 4422 return 0; 4423 } 4424 EXPORT_SYMBOL(sock_ioctl_inout); 4425 4426 /* This is the most common ioctl prep function, where the result (4 bytes) is 4427 * copied back to userspace if the ioctl() returns successfully. No input is 4428 * copied from userspace as input argument. 4429 */ 4430 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4431 { 4432 int ret, karg = 0; 4433 4434 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4435 if (ret) 4436 return ret; 4437 4438 return put_user(karg, (int __user *)arg); 4439 } 4440 4441 /* A wrapper around sock ioctls, which copies the data from userspace 4442 * (depending on the protocol/ioctl), and copies back the result to userspace. 4443 * The main motivation for this function is to pass kernel memory to the 4444 * protocol ioctl callbacks, instead of userspace memory. 4445 */ 4446 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4447 { 4448 int rc = 1; 4449 4450 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4451 rc = ipmr_sk_ioctl(sk, cmd, arg); 4452 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4453 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4454 else if (sk_is_phonet(sk)) 4455 rc = phonet_sk_ioctl(sk, cmd, arg); 4456 4457 /* If ioctl was processed, returns its value */ 4458 if (rc <= 0) 4459 return rc; 4460 4461 /* Otherwise call the default handler */ 4462 return sock_ioctl_out(sk, cmd, arg); 4463 } 4464 EXPORT_SYMBOL(sk_ioctl); 4465 4466 static int __init sock_struct_check(void) 4467 { 4468 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); 4469 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); 4470 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); 4471 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); 4472 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); 4473 4474 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); 4475 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); 4476 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); 4477 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); 4478 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); 4479 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); 4480 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); 4481 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); 4482 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); 4483 4484 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); 4485 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); 4486 #ifdef CONFIG_MEMCG 4487 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4488 #endif 4489 4490 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4491 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); 4492 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); 4493 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); 4494 
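/*
 * The assertions in this function pin down the cacheline grouping of
 * struct sock fields. Each group (sock_write_rx, sock_read_rx,
 * sock_read_rxtx, sock_write_rxtx, sock_write_tx, sock_read_tx) is
 * delimited in struct sock by __cacheline_group_begin() /
 * __cacheline_group_end() markers, and CACHELINE_ASSERT_GROUP_MEMBER()
 * is a compile-time offsetof() check that breaks the build if a field
 * drifts out of its intended group. A minimal sketch of the pattern,
 * with a made-up struct and field purely for illustration:
 *
 *	struct example_sock {
 *		__cacheline_group_begin(example_write_tx);
 *		unsigned int	tx_seq;
 *		__cacheline_group_end(example_write_tx);
 *	};
 *	CACHELINE_ASSERT_GROUP_MEMBER(struct example_sock, example_write_tx, tx_seq);
 *
 * Grouping fields by access pattern keeps the RX and TX fast paths from
 * dirtying each other's cache lines.
 */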
4495 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4497 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); 4498 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); 4499 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); 4500 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); 4501 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); 4502 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); 4503 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); 4504 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm); 4505 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status); 4506 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); 4507 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); 4508 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); 4509 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); 4510 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); 4511 4512 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); 4513 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 4514 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); 4515 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); 4516 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); 4517 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); 4518 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); 4519 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); 4520 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); 4521 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); 4522 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); 4523 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); 4524 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); 4525 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); 4526 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); 4527 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); 4528 return 0; 4529 } 4530 4531 core_initcall(sock_struct_check); 4532
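/*
 * For reference, a protocol module typically hooks into the facilities in
 * this file roughly as follows. This is an illustrative sketch only: the
 * "example" names are invented, and real protocols (TCP's tcp_prot, for
 * instance) fill in many more struct proto callbacks; the sock_common_*()
 * helpers above are normally wired into the address family's
 * struct proto_ops instead.
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_proto, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_proto);
 *	}
 *
 * Passing alloc_slab == 1 asks proto_register() to create the per-protocol
 * kmem_cache (plus request_sock and timewait caches when ->rsk_prot or
 * ->twsk_prot are set), from which sockets of this protocol are later
 * allocated; proto_unregister() tears those caches down again.
 */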