1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <linux/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/udp.h> 111 #include <linux/init.h> 112 #include <linux/highmem.h> 113 #include <linux/user_namespace.h> 114 #include <linux/static_key.h> 115 #include <linux/memcontrol.h> 116 #include <linux/prefetch.h> 117 #include <linux/compat.h> 118 #include <linux/mroute.h> 119 #include <linux/mroute6.h> 120 #include <linux/icmpv6.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <linux/skbuff_ref.h> 128 #include <net/net_namespace.h> 129 #include <net/request_sock.h> 130 #include <net/sock.h> 131 #include <net/proto_memory.h> 132 #include <linux/net_tstamp.h> 133 #include <net/xfrm.h> 134 #include <linux/ipsec.h> 135 #include <net/cls_cgroup.h> 136 #include <net/netprio_cgroup.h> 137 #include <linux/sock_diag.h> 138 139 #include <linux/filter.h> 140 #include <net/sock_reuseport.h> 141 #include <net/bpf_sk_storage.h> 142 143 #include <trace/events/sock.h> 144 145 #include <net/tcp.h> 146 #include <net/busy_poll.h> 147 #include <net/phonet/phonet.h> 148 149 #include <linux/ethtool.h> 150 151 #include <uapi/linux/pidfd.h> 152 153 #include "dev.h" 154 155 static DEFINE_MUTEX(proto_list_mutex); 156 static LIST_HEAD(proto_list); 157 158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); 159 static void sock_def_write_space(struct sock *sk); 160 161 /** 162 * sk_ns_capable - General socket capability test 163 * @sk: Socket to use a capability on or through 164 * @user_ns: The user namespace of the capability to use 165 * @cap: The capability to use 166 * 167 * Test to see if the opener of the socket had when the socket was 168 * created and the current process has the capability @cap in the user 169 * namespace @user_ns. 170 */ 171 bool sk_ns_capable(const struct sock *sk, 172 struct user_namespace *user_ns, int cap) 173 { 174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 175 ns_capable(user_ns, cap); 176 } 177 EXPORT_SYMBOL(sk_ns_capable); 178 179 /** 180 * sk_capable - Socket global capability test 181 * @sk: Socket to use a capability on or through 182 * @cap: The global capability to use 183 * 184 * Test to see if the opener of the socket had when the socket was 185 * created and the current process has the capability @cap in all user 186 * namespaces. 
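 * That is: the task that created the socket held @cap in the initial user
 * namespace at creation time, and the calling task holds @cap now.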
187 */ 188 bool sk_capable(const struct sock *sk, int cap) 189 { 190 return sk_ns_capable(sk, &init_user_ns, cap); 191 } 192 EXPORT_SYMBOL(sk_capable); 193 194 /** 195 * sk_net_capable - Network namespace socket capability test 196 * @sk: Socket to use a capability on or through 197 * @cap: The capability to use 198 * 199 * Test to see if the opener of the socket had when the socket was created 200 * and the current process has the capability @cap over the network namespace 201 * the socket is a member of. 202 */ 203 bool sk_net_capable(const struct sock *sk, int cap) 204 { 205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 206 } 207 EXPORT_SYMBOL(sk_net_capable); 208 209 /* 210 * Each address family might have different locking rules, so we have 211 * one slock key per address family and separate keys for internal and 212 * userspace sockets. 213 */ 214 static struct lock_class_key af_family_keys[AF_MAX]; 215 static struct lock_class_key af_family_kern_keys[AF_MAX]; 216 static struct lock_class_key af_family_slock_keys[AF_MAX]; 217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 218 219 /* 220 * Make lock validator output more readable. (we pre-construct these 221 * strings build-time, so that runtime initialization of socket 222 * locks is fast): 223 */ 224 225 #define _sock_locks(x) \ 226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 235 x "27" , x "28" , x "AF_CAN" , \ 236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 241 x "AF_MCTP" , \ 242 x "AF_MAX" 243 244 static const char *const af_family_key_strings[AF_MAX+1] = { 245 _sock_locks("sk_lock-") 246 }; 247 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 248 _sock_locks("slock-") 249 }; 250 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 251 _sock_locks("clock-") 252 }; 253 254 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 255 _sock_locks("k-sk_lock-") 256 }; 257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 258 _sock_locks("k-slock-") 259 }; 260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 261 _sock_locks("k-clock-") 262 }; 263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 264 _sock_locks("rlock-") 265 }; 266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 267 _sock_locks("wlock-") 268 }; 269 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 270 _sock_locks("elock-") 271 }; 272 273 /* 274 * sk_callback_lock and sk queues locking rules are per-address-family, 275 * so split the lock classes by using a per-AF key: 276 */ 277 static struct lock_class_key af_callback_keys[AF_MAX]; 278 static struct lock_class_key af_rlock_keys[AF_MAX]; 279 static struct lock_class_key af_wlock_keys[AF_MAX]; 280 static struct lock_class_key af_elock_keys[AF_MAX]; 281 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 282 283 /* Run time adjustable parameters. 
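 * These back the net.core.wmem_max, net.core.rmem_max, net.core.wmem_default
 * and net.core.rmem_default sysctls; sk_setsockopt() clamps unprivileged
 * SO_SNDBUF/SO_RCVBUF requests to the corresponding *_max values.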
*/ 284 __u32 sysctl_wmem_max __read_mostly = 4 << 20; 285 EXPORT_SYMBOL(sysctl_wmem_max); 286 __u32 sysctl_rmem_max __read_mostly = 4 << 20; 287 EXPORT_SYMBOL(sysctl_rmem_max); 288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; 289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; 290 291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 292 EXPORT_SYMBOL_GPL(memalloc_socks_key); 293 294 /** 295 * sk_set_memalloc - sets %SOCK_MEMALLOC 296 * @sk: socket to set it on 297 * 298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 299 * It's the responsibility of the admin to adjust min_free_kbytes 300 * to meet the requirements 301 */ 302 void sk_set_memalloc(struct sock *sk) 303 { 304 sock_set_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation |= __GFP_MEMALLOC; 306 static_branch_inc(&memalloc_socks_key); 307 } 308 EXPORT_SYMBOL_GPL(sk_set_memalloc); 309 310 void sk_clear_memalloc(struct sock *sk) 311 { 312 sock_reset_flag(sk, SOCK_MEMALLOC); 313 sk->sk_allocation &= ~__GFP_MEMALLOC; 314 static_branch_dec(&memalloc_socks_key); 315 316 /* 317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 318 * progress of swapping. SOCK_MEMALLOC may be cleared while 319 * it has rmem allocations due to the last swapfile being deactivated 320 * but there is a risk that the socket is unusable due to exceeding 321 * the rmem limits. Reclaim the reserves and obey rmem limits again. 322 */ 323 sk_mem_reclaim(sk); 324 } 325 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 326 327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 328 { 329 int ret; 330 unsigned int noreclaim_flag; 331 332 /* these should have been dropped before queueing */ 333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 334 335 noreclaim_flag = memalloc_noreclaim_save(); 336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 337 tcp_v6_do_rcv, 338 tcp_v4_do_rcv, 339 sk, skb); 340 memalloc_noreclaim_restore(noreclaim_flag); 341 342 return ret; 343 } 344 EXPORT_SYMBOL(__sk_backlog_rcv); 345 346 void sk_error_report(struct sock *sk) 347 { 348 sk->sk_error_report(sk); 349 350 switch (sk->sk_family) { 351 case AF_INET: 352 fallthrough; 353 case AF_INET6: 354 trace_inet_sk_error_report(sk); 355 break; 356 default: 357 break; 358 } 359 } 360 EXPORT_SYMBOL(sk_error_report); 361 362 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 363 { 364 struct __kernel_sock_timeval tv; 365 366 if (timeo == MAX_SCHEDULE_TIMEOUT) { 367 tv.tv_sec = 0; 368 tv.tv_usec = 0; 369 } else { 370 tv.tv_sec = timeo / HZ; 371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 372 } 373 374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 376 *(struct old_timeval32 *)optval = tv32; 377 return sizeof(tv32); 378 } 379 380 if (old_timeval) { 381 struct __kernel_old_timeval old_tv; 382 old_tv.tv_sec = tv.tv_sec; 383 old_tv.tv_usec = tv.tv_usec; 384 *(struct __kernel_old_timeval *)optval = old_tv; 385 return sizeof(old_tv); 386 } 387 388 *(struct __kernel_sock_timeval *)optval = tv; 389 return sizeof(tv); 390 } 391 EXPORT_SYMBOL(sock_get_timeout); 392 393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 394 sockptr_t optval, int optlen, bool old_timeval) 395 { 396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 397 struct old_timeval32 tv32; 398 399 if (optlen < sizeof(tv32)) 400 return -EINVAL; 401 402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 403 return -EFAULT; 404 tv->tv_sec = tv32.tv_sec; 405 tv->tv_usec = tv32.tv_usec; 
406 } else if (old_timeval) { 407 struct __kernel_old_timeval old_tv; 408 409 if (optlen < sizeof(old_tv)) 410 return -EINVAL; 411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 412 return -EFAULT; 413 tv->tv_sec = old_tv.tv_sec; 414 tv->tv_usec = old_tv.tv_usec; 415 } else { 416 if (optlen < sizeof(*tv)) 417 return -EINVAL; 418 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 419 return -EFAULT; 420 } 421 422 return 0; 423 } 424 EXPORT_SYMBOL(sock_copy_user_timeval); 425 426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 427 bool old_timeval) 428 { 429 struct __kernel_sock_timeval tv; 430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 431 long val; 432 433 if (err) 434 return err; 435 436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 437 return -EDOM; 438 439 if (tv.tv_sec < 0) { 440 static int warned __read_mostly; 441 442 WRITE_ONCE(*timeo_p, 0); 443 if (warned < 10 && net_ratelimit()) { 444 warned++; 445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 446 __func__, current->comm, task_pid_nr(current)); 447 } 448 return 0; 449 } 450 val = MAX_SCHEDULE_TIMEOUT; 451 if ((tv.tv_sec || tv.tv_usec) && 452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 454 USEC_PER_SEC / HZ); 455 WRITE_ONCE(*timeo_p, val); 456 return 0; 457 } 458 459 static bool sk_set_prio_allowed(const struct sock *sk, int val) 460 { 461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || 462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); 464 } 465 466 static bool sock_needs_netstamp(const struct sock *sk) 467 { 468 switch (sk->sk_family) { 469 case AF_UNSPEC: 470 case AF_UNIX: 471 return false; 472 default: 473 return true; 474 } 475 } 476 477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 478 { 479 if (sk->sk_flags & flags) { 480 sk->sk_flags &= ~flags; 481 if (sock_needs_netstamp(sk) && 482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 483 net_disable_timestamp(); 484 } 485 } 486 487 488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 489 { 490 unsigned long flags; 491 struct sk_buff_head *list = &sk->sk_receive_queue; 492 493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 494 sk_drops_inc(sk); 495 trace_sock_rcvqueue_full(sk, skb); 496 return -ENOMEM; 497 } 498 499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 500 sk_drops_inc(sk); 501 return -ENOBUFS; 502 } 503 504 skb->dev = NULL; 505 skb_set_owner_r(skb, sk); 506 507 /* we escape from rcu protected region, make sure we dont leak 508 * a norefcounted dst 509 */ 510 skb_dst_force(skb); 511 512 spin_lock_irqsave(&list->lock, flags); 513 sock_skb_set_dropcount(sk, skb); 514 __skb_queue_tail(list, skb); 515 spin_unlock_irqrestore(&list->lock, flags); 516 517 if (!sock_flag(sk, SOCK_DEAD)) 518 sk->sk_data_ready(sk); 519 return 0; 520 } 521 EXPORT_SYMBOL(__sock_queue_rcv_skb); 522 523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 524 enum skb_drop_reason *reason) 525 { 526 enum skb_drop_reason drop_reason; 527 int err; 528 529 err = sk_filter_reason(sk, skb, &drop_reason); 530 if (err) 531 goto out; 532 533 err = __sock_queue_rcv_skb(sk, skb); 534 switch (err) { 535 case -ENOMEM: 536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 537 break; 538 case -ENOBUFS: 539 drop_reason = SKB_DROP_REASON_PROTO_MEM; 540 break; 541 default: 542 drop_reason = SKB_NOT_DROPPED_YET; 543 
break; 544 } 545 out: 546 if (reason) 547 *reason = drop_reason; 548 return err; 549 } 550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 551 552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 553 const int nested, unsigned int trim_cap, bool refcounted) 554 { 555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 556 int rc = NET_RX_SUCCESS; 557 int err; 558 559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) 560 goto discard_and_relse; 561 562 skb->dev = NULL; 563 564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 565 sk_drops_inc(sk); 566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 567 goto discard_and_relse; 568 } 569 if (nested) 570 bh_lock_sock_nested(sk); 571 else 572 bh_lock_sock(sk); 573 if (!sock_owned_by_user(sk)) { 574 /* 575 * trylock + unlock semantics: 576 */ 577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 578 579 rc = sk_backlog_rcv(sk, skb); 580 581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { 583 bh_unlock_sock(sk); 584 if (err == -ENOMEM) 585 reason = SKB_DROP_REASON_PFMEMALLOC; 586 if (err == -ENOBUFS) 587 reason = SKB_DROP_REASON_SOCKET_BACKLOG; 588 sk_drops_inc(sk); 589 goto discard_and_relse; 590 } 591 592 bh_unlock_sock(sk); 593 out: 594 if (refcounted) 595 sock_put(sk); 596 return rc; 597 discard_and_relse: 598 sk_skb_reason_drop(sk, skb, reason); 599 goto out; 600 } 601 EXPORT_SYMBOL(__sk_receive_skb); 602 603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 604 u32)); 605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 606 u32)); 607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 608 { 609 struct dst_entry *dst = __sk_dst_get(sk); 610 611 if (dst && READ_ONCE(dst->obsolete) && 612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 613 dst, cookie) == NULL) { 614 sk_tx_queue_clear(sk); 615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 617 dst_release(dst); 618 return NULL; 619 } 620 621 return dst; 622 } 623 EXPORT_SYMBOL(__sk_dst_check); 624 625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 626 { 627 struct dst_entry *dst = sk_dst_get(sk); 628 629 if (dst && READ_ONCE(dst->obsolete) && 630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 631 dst, cookie) == NULL) { 632 sk_dst_reset(sk); 633 dst_release(dst); 634 return NULL; 635 } 636 637 return dst; 638 } 639 EXPORT_SYMBOL(sk_dst_check); 640 641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 642 { 643 int ret = -ENOPROTOOPT; 644 #ifdef CONFIG_NETDEVICES 645 struct net *net = sock_net(sk); 646 647 /* Sorry... */ 648 ret = -EPERM; 649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 650 goto out; 651 652 ret = -EINVAL; 653 if (ifindex < 0) 654 goto out; 655 656 /* Paired with all READ_ONCE() done locklessly. 
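 * (lockless readers of sk->sk_bound_dev_if use READ_ONCE(), so the store
 * below must be a WRITE_ONCE()).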
*/ 657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 658 659 if (sk->sk_prot->rehash) 660 sk->sk_prot->rehash(sk); 661 sk_dst_reset(sk); 662 663 ret = 0; 664 665 out: 666 #endif 667 668 return ret; 669 } 670 671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 672 { 673 int ret; 674 675 if (lock_sk) 676 lock_sock(sk); 677 ret = sock_bindtoindex_locked(sk, ifindex); 678 if (lock_sk) 679 release_sock(sk); 680 681 return ret; 682 } 683 EXPORT_SYMBOL(sock_bindtoindex); 684 685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 686 { 687 int ret = -ENOPROTOOPT; 688 #ifdef CONFIG_NETDEVICES 689 struct net *net = sock_net(sk); 690 char devname[IFNAMSIZ]; 691 int index; 692 693 ret = -EINVAL; 694 if (optlen < 0) 695 goto out; 696 697 /* Bind this socket to a particular device like "eth0", 698 * as specified in the passed interface name. If the 699 * name is "" or the option length is zero the socket 700 * is not bound. 701 */ 702 if (optlen > IFNAMSIZ - 1) 703 optlen = IFNAMSIZ - 1; 704 memset(devname, 0, sizeof(devname)); 705 706 ret = -EFAULT; 707 if (copy_from_sockptr(devname, optval, optlen)) 708 goto out; 709 710 index = 0; 711 if (devname[0] != '\0') { 712 struct net_device *dev; 713 714 rcu_read_lock(); 715 dev = dev_get_by_name_rcu(net, devname); 716 if (dev) 717 index = dev->ifindex; 718 rcu_read_unlock(); 719 ret = -ENODEV; 720 if (!dev) 721 goto out; 722 } 723 724 sockopt_lock_sock(sk); 725 ret = sock_bindtoindex_locked(sk, index); 726 sockopt_release_sock(sk); 727 out: 728 #endif 729 730 return ret; 731 } 732 733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 734 sockptr_t optlen, int len) 735 { 736 int ret = -ENOPROTOOPT; 737 #ifdef CONFIG_NETDEVICES 738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 739 struct net *net = sock_net(sk); 740 char devname[IFNAMSIZ]; 741 742 if (bound_dev_if == 0) { 743 len = 0; 744 goto zero; 745 } 746 747 ret = -EINVAL; 748 if (len < IFNAMSIZ) 749 goto out; 750 751 ret = netdev_get_name(net, devname, bound_dev_if); 752 if (ret) 753 goto out; 754 755 len = strlen(devname) + 1; 756 757 ret = -EFAULT; 758 if (copy_to_sockptr(optval, devname, len)) 759 goto out; 760 761 zero: 762 ret = -EFAULT; 763 if (copy_to_sockptr(optlen, &len, sizeof(int))) 764 goto out; 765 766 ret = 0; 767 768 out: 769 #endif 770 771 return ret; 772 } 773 774 bool sk_mc_loop(const struct sock *sk) 775 { 776 if (dev_recursion_level()) 777 return false; 778 if (!sk) 779 return true; 780 /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ 781 switch (READ_ONCE(sk->sk_family)) { 782 case AF_INET: 783 return inet_test_bit(MC_LOOP, sk); 784 #if IS_ENABLED(CONFIG_IPV6) 785 case AF_INET6: 786 return inet6_test_bit(MC6_LOOP, sk); 787 #endif 788 } 789 WARN_ON_ONCE(1); 790 return true; 791 } 792 EXPORT_SYMBOL(sk_mc_loop); 793 794 void sock_set_reuseaddr(struct sock *sk) 795 { 796 lock_sock(sk); 797 sk->sk_reuse = SK_CAN_REUSE; 798 release_sock(sk); 799 } 800 EXPORT_SYMBOL(sock_set_reuseaddr); 801 802 void sock_set_reuseport(struct sock *sk) 803 { 804 lock_sock(sk); 805 sk->sk_reuseport = true; 806 release_sock(sk); 807 } 808 EXPORT_SYMBOL(sock_set_reuseport); 809 810 void sock_no_linger(struct sock *sk) 811 { 812 lock_sock(sk); 813 WRITE_ONCE(sk->sk_lingertime, 0); 814 sock_set_flag(sk, SOCK_LINGER); 815 release_sock(sk); 816 } 817 EXPORT_SYMBOL(sock_no_linger); 818 819 void sock_set_priority(struct sock *sk, u32 priority) 820 { 821 WRITE_ONCE(sk->sk_priority, priority); 822 } 823 EXPORT_SYMBOL(sock_set_priority); 824 825 void sock_set_sndtimeo(struct sock *sk, s64 secs) 826 { 827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 829 else 830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 831 } 832 EXPORT_SYMBOL(sock_set_sndtimeo); 833 834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 835 { 836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); 837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); 838 if (val) { 839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 840 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 841 } 842 } 843 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 845 { 846 switch (optname) { 847 case SO_TIMESTAMP_OLD: 848 __sock_set_timestamps(sk, valbool, false, false); 849 break; 850 case SO_TIMESTAMP_NEW: 851 __sock_set_timestamps(sk, valbool, true, false); 852 break; 853 case SO_TIMESTAMPNS_OLD: 854 __sock_set_timestamps(sk, valbool, false, true); 855 break; 856 case SO_TIMESTAMPNS_NEW: 857 __sock_set_timestamps(sk, valbool, true, true); 858 break; 859 } 860 } 861 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 863 { 864 struct net *net = sock_net(sk); 865 struct net_device *dev = NULL; 866 bool match = false; 867 int *vclock_index; 868 int i, num; 869 870 if (sk->sk_bound_dev_if) 871 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 872 873 if (!dev) { 874 pr_err("%s: sock not bind to device\n", __func__); 875 return -EOPNOTSUPP; 876 } 877 878 num = ethtool_get_phc_vclocks(dev, &vclock_index); 879 dev_put(dev); 880 881 for (i = 0; i < num; i++) { 882 if (*(vclock_index + i) == phc_index) { 883 match = true; 884 break; 885 } 886 } 887 888 if (num > 0) 889 kfree(vclock_index); 890 891 if (!match) 892 return -EINVAL; 893 894 WRITE_ONCE(sk->sk_bind_phc, phc_index); 895 896 return 0; 897 } 898 899 int sock_set_timestamping(struct sock *sk, int optname, 900 struct so_timestamping timestamping) 901 { 902 int val = timestamping.flags; 903 int ret; 904 905 if (val & ~SOF_TIMESTAMPING_MASK) 906 return -EINVAL; 907 908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 909 !(val & SOF_TIMESTAMPING_OPT_ID)) 910 return -EINVAL; 911 912 if (val & SOF_TIMESTAMPING_OPT_ID && 913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 914 if (sk_is_tcp(sk)) { 915 if ((1 << sk->sk_state) & 916 (TCPF_CLOSE | TCPF_LISTEN)) 917 return -EINVAL; 918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 920 else 921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 922 } else { 923 
atomic_set(&sk->sk_tskey, 0); 924 } 925 } 926 927 if (val & SOF_TIMESTAMPING_OPT_STATS && 928 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 929 return -EINVAL; 930 931 if (val & SOF_TIMESTAMPING_BIND_PHC) { 932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 933 if (ret) 934 return ret; 935 } 936 937 WRITE_ONCE(sk->sk_tsflags, val); 938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); 940 941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 942 sock_enable_timestamp(sk, 943 SOCK_TIMESTAMPING_RX_SOFTWARE); 944 else 945 sock_disable_timestamp(sk, 946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 947 return 0; 948 } 949 950 #if defined(CONFIG_CGROUP_BPF) 951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) 952 { 953 struct bpf_sock_ops_kern sock_ops; 954 955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 956 sock_ops.op = op; 957 sock_ops.is_fullsock = 1; 958 sock_ops.sk = sk; 959 bpf_skops_init_skb(&sock_ops, skb, 0); 960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); 961 } 962 #endif 963 964 void sock_set_keepalive(struct sock *sk) 965 { 966 lock_sock(sk); 967 if (sk->sk_prot->keepalive) 968 sk->sk_prot->keepalive(sk, true); 969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 970 release_sock(sk); 971 } 972 EXPORT_SYMBOL(sock_set_keepalive); 973 974 static void __sock_set_rcvbuf(struct sock *sk, int val) 975 { 976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 977 * as a negative value. 978 */ 979 val = min_t(int, val, INT_MAX / 2); 980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 981 982 /* We double it on the way in to account for "struct sk_buff" etc. 983 * overhead. Applications assume that the SO_RCVBUF setting they make 984 * will allow that much actual data to be received on that socket. 985 * 986 * Applications are unaware that "struct sk_buff" and other overheads 987 * allocate from the receive buffer during socket buffer allocation. 988 * 989 * And after considering the possible alternatives, returning the value 990 * we actually used in getsockopt is the most desirable behavior. 
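 * Illustrative effect, as seen from a hypothetical userspace caller (not
 * part of this file):
 *
 *   int v = 64 * 1024;
 *   socklen_t len = sizeof(v);
 *
 *   setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &v, sizeof(v));
 *   getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &v, &len);
 *   // v now reads back as roughly 2 * 64 * 1024 (assuming the request was
 *   // within net.core.rmem_max), reflecting the doubling applied below.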
991 */ 992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 993 } 994 995 void sock_set_rcvbuf(struct sock *sk, int val) 996 { 997 lock_sock(sk); 998 __sock_set_rcvbuf(sk, val); 999 release_sock(sk); 1000 } 1001 EXPORT_SYMBOL(sock_set_rcvbuf); 1002 1003 static void __sock_set_mark(struct sock *sk, u32 val) 1004 { 1005 if (val != sk->sk_mark) { 1006 WRITE_ONCE(sk->sk_mark, val); 1007 sk_dst_reset(sk); 1008 } 1009 } 1010 1011 void sock_set_mark(struct sock *sk, u32 val) 1012 { 1013 lock_sock(sk); 1014 __sock_set_mark(sk, val); 1015 release_sock(sk); 1016 } 1017 EXPORT_SYMBOL(sock_set_mark); 1018 1019 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1020 { 1021 /* Round down bytes to multiple of pages */ 1022 bytes = round_down(bytes, PAGE_SIZE); 1023 1024 WARN_ON(bytes > sk->sk_reserved_mem); 1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1026 sk_mem_reclaim(sk); 1027 } 1028 1029 static int sock_reserve_memory(struct sock *sk, int bytes) 1030 { 1031 long allocated; 1032 bool charged; 1033 int pages; 1034 1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) 1036 return -EOPNOTSUPP; 1037 1038 if (!bytes) 1039 return 0; 1040 1041 pages = sk_mem_pages(bytes); 1042 1043 /* pre-charge to memcg */ 1044 charged = mem_cgroup_sk_charge(sk, pages, 1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1046 if (!charged) 1047 return -ENOMEM; 1048 1049 if (sk->sk_bypass_prot_mem) 1050 goto success; 1051 1052 /* pre-charge to forward_alloc */ 1053 sk_memory_allocated_add(sk, pages); 1054 allocated = sk_memory_allocated(sk); 1055 1056 /* If the system goes into memory pressure with this 1057 * precharge, give up and return error. 1058 */ 1059 if (allocated > sk_prot_mem_limits(sk, 1)) { 1060 sk_memory_allocated_sub(sk, pages); 1061 mem_cgroup_sk_uncharge(sk, pages); 1062 return -ENOMEM; 1063 } 1064 1065 success: 1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1067 1068 WRITE_ONCE(sk->sk_reserved_mem, 1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1070 1071 return 0; 1072 } 1073 1074 #ifdef CONFIG_PAGE_POOL 1075 1076 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED 1077 * in 1 syscall. The limit exists to limit the amount of memory the kernel 1078 * allocates to copy these tokens, and to prevent looping over the frags for 1079 * too long. 
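 * Rough sketch of the matching (hypothetical) userspace call, using the
 * uapi struct dmabuf_token consumed below ('id' stands for a frag token
 * previously handed to userspace by the devmem RX path):
 *
 *   struct dmabuf_token tok = { .token_start = id, .token_count = 1 };
 *
 *   setsockopt(fd, SOL_SOCKET, SO_DEVMEM_DONTNEED, &tok, sizeof(tok));
 *
 * sock_devmem_dontneed() returns the number of fragments it actually
 * released, so the caller can detect partially honoured requests.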
1080 */ 1081 #define MAX_DONTNEED_TOKENS 128 1082 #define MAX_DONTNEED_FRAGS 1024 1083 1084 static noinline_for_stack int 1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1086 { 1087 unsigned int num_tokens, i, j, k, netmem_num = 0; 1088 struct dmabuf_token *tokens; 1089 int ret = 0, num_frags = 0; 1090 netmem_ref netmems[16]; 1091 1092 if (!sk_is_tcp(sk)) 1093 return -EBADF; 1094 1095 if (optlen % sizeof(*tokens) || 1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1097 return -EINVAL; 1098 1099 num_tokens = optlen / sizeof(*tokens); 1100 tokens = kvmalloc_objs(*tokens, num_tokens); 1101 if (!tokens) 1102 return -ENOMEM; 1103 1104 if (copy_from_sockptr(tokens, optval, optlen)) { 1105 kvfree(tokens); 1106 return -EFAULT; 1107 } 1108 1109 xa_lock_bh(&sk->sk_user_frags); 1110 for (i = 0; i < num_tokens; i++) { 1111 for (j = 0; j < tokens[i].token_count; j++) { 1112 if (++num_frags > MAX_DONTNEED_FRAGS) 1113 goto frag_limit_reached; 1114 1115 netmem_ref netmem = (__force netmem_ref)__xa_erase( 1116 &sk->sk_user_frags, tokens[i].token_start + j); 1117 1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1119 continue; 1120 1121 netmems[netmem_num++] = netmem; 1122 if (netmem_num == ARRAY_SIZE(netmems)) { 1123 xa_unlock_bh(&sk->sk_user_frags); 1124 for (k = 0; k < netmem_num; k++) 1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1126 netmem_num = 0; 1127 xa_lock_bh(&sk->sk_user_frags); 1128 } 1129 ret++; 1130 } 1131 } 1132 1133 frag_limit_reached: 1134 xa_unlock_bh(&sk->sk_user_frags); 1135 for (k = 0; k < netmem_num; k++) 1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1137 1138 kvfree(tokens); 1139 return ret; 1140 } 1141 #endif 1142 1143 void sockopt_lock_sock(struct sock *sk) 1144 { 1145 /* When current->bpf_ctx is set, the setsockopt is called from 1146 * a bpf prog. bpf has ensured the sk lock has been 1147 * acquired before calling setsockopt(). 1148 */ 1149 if (has_current_bpf_ctx()) 1150 return; 1151 1152 lock_sock(sk); 1153 } 1154 EXPORT_SYMBOL(sockopt_lock_sock); 1155 1156 void sockopt_release_sock(struct sock *sk) 1157 { 1158 if (has_current_bpf_ctx()) 1159 return; 1160 1161 release_sock(sk); 1162 } 1163 EXPORT_SYMBOL(sockopt_release_sock); 1164 1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1166 { 1167 return has_current_bpf_ctx() || ns_capable(ns, cap); 1168 } 1169 EXPORT_SYMBOL(sockopt_ns_capable); 1170 1171 bool sockopt_capable(int cap) 1172 { 1173 return has_current_bpf_ctx() || capable(cap); 1174 } 1175 EXPORT_SYMBOL(sockopt_capable); 1176 1177 static int sockopt_validate_clockid(__kernel_clockid_t value) 1178 { 1179 switch (value) { 1180 case CLOCK_REALTIME: 1181 case CLOCK_MONOTONIC: 1182 case CLOCK_TAI: 1183 return 0; 1184 } 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * This is meant for all protocols to use and covers goings on 1190 * at the socket level. Everything here is generic. 1191 */ 1192 1193 int sk_setsockopt(struct sock *sk, int level, int optname, 1194 sockptr_t optval, unsigned int optlen) 1195 { 1196 struct so_timestamping timestamping; 1197 struct socket *sock = sk->sk_socket; 1198 struct sock_txtime sk_txtime; 1199 int val; 1200 int valbool; 1201 struct linger ling; 1202 int ret = 0; 1203 1204 /* 1205 * Options without arguments 1206 */ 1207 1208 if (optname == SO_BINDTODEVICE) 1209 return sock_setbindtodevice(sk, optval, optlen); 1210 1211 if (optlen < sizeof(int)) 1212 return -EINVAL; 1213 1214 if (copy_from_sockptr(&val, optval, sizeof(val))) 1215 return -EFAULT; 1216 1217 valbool = val ? 
1 : 0; 1218 1219 /* handle options which do not require locking the socket. */ 1220 switch (optname) { 1221 case SO_PRIORITY: 1222 if (sk_set_prio_allowed(sk, val)) { 1223 sock_set_priority(sk, val); 1224 return 0; 1225 } 1226 return -EPERM; 1227 case SO_TYPE: 1228 case SO_PROTOCOL: 1229 case SO_DOMAIN: 1230 case SO_ERROR: 1231 return -ENOPROTOOPT; 1232 #ifdef CONFIG_NET_RX_BUSY_POLL 1233 case SO_BUSY_POLL: 1234 if (val < 0) 1235 return -EINVAL; 1236 WRITE_ONCE(sk->sk_ll_usec, val); 1237 return 0; 1238 case SO_PREFER_BUSY_POLL: 1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1240 return -EPERM; 1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1242 return 0; 1243 case SO_BUSY_POLL_BUDGET: 1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1245 !sockopt_capable(CAP_NET_ADMIN)) 1246 return -EPERM; 1247 if (val < 0 || val > U16_MAX) 1248 return -EINVAL; 1249 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1250 return 0; 1251 #endif 1252 case SO_MAX_PACING_RATE: 1253 { 1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1255 unsigned long pacing_rate; 1256 1257 if (sizeof(ulval) != sizeof(val) && 1258 optlen >= sizeof(ulval) && 1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1260 return -EFAULT; 1261 } 1262 if (ulval != ~0UL) 1263 cmpxchg(&sk->sk_pacing_status, 1264 SK_PACING_NONE, 1265 SK_PACING_NEEDED); 1266 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1269 if (ulval < pacing_rate) 1270 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1271 return 0; 1272 } 1273 case SO_TXREHASH: 1274 if (!sk_is_tcp(sk)) 1275 return -EOPNOTSUPP; 1276 if (val < -1 || val > 1) 1277 return -EINVAL; 1278 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1280 /* Paired with READ_ONCE() in tcp_rtx_synack() 1281 * and sk_getsockopt(). 1282 */ 1283 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1284 return 0; 1285 case SO_PEEK_OFF: 1286 { 1287 int (*set_peek_off)(struct sock *sk, int val); 1288 1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1290 if (set_peek_off) 1291 ret = set_peek_off(sk, val); 1292 else 1293 ret = -EOPNOTSUPP; 1294 return ret; 1295 } 1296 #ifdef CONFIG_PAGE_POOL 1297 case SO_DEVMEM_DONTNEED: 1298 return sock_devmem_dontneed(sk, optval, optlen); 1299 #endif 1300 case SO_SNDTIMEO_OLD: 1301 case SO_SNDTIMEO_NEW: 1302 return sock_set_timeout(&sk->sk_sndtimeo, optval, 1303 optlen, optname == SO_SNDTIMEO_OLD); 1304 case SO_RCVTIMEO_OLD: 1305 case SO_RCVTIMEO_NEW: 1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval, 1307 optlen, optname == SO_RCVTIMEO_OLD); 1308 } 1309 1310 sockopt_lock_sock(sk); 1311 1312 switch (optname) { 1313 case SO_DEBUG: 1314 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1315 ret = -EACCES; 1316 else 1317 sock_valbool_flag(sk, SOCK_DBG, valbool); 1318 break; 1319 case SO_REUSEADDR: 1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1321 break; 1322 case SO_REUSEPORT: 1323 if (valbool && !sk_is_inet(sk)) 1324 ret = -EOPNOTSUPP; 1325 else 1326 sk->sk_reuseport = valbool; 1327 break; 1328 case SO_DONTROUTE: 1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1330 sk_dst_reset(sk); 1331 break; 1332 case SO_BROADCAST: 1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1334 break; 1335 case SO_SNDBUF: 1336 /* Don't error on this BSD doesn't and if you think 1337 * about it this is right. Otherwise apps have to 1338 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF 1339 * are treated in BSD as hints 1340 */ 1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1342 set_sndbuf: 1343 /* Ensure val * 2 fits into an int, to prevent max_t() 1344 * from treating it as a negative value. 1345 */ 1346 val = min_t(int, val, INT_MAX / 2); 1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1348 WRITE_ONCE(sk->sk_sndbuf, 1349 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1350 /* Wake up sending tasks if we upped the value. */ 1351 sk->sk_write_space(sk); 1352 break; 1353 1354 case SO_SNDBUFFORCE: 1355 if (!sockopt_capable(CAP_NET_ADMIN)) { 1356 ret = -EPERM; 1357 break; 1358 } 1359 1360 /* No negative values (to prevent underflow, as val will be 1361 * multiplied by 2). 1362 */ 1363 if (val < 0) 1364 val = 0; 1365 goto set_sndbuf; 1366 1367 case SO_RCVBUF: 1368 /* Don't error on this BSD doesn't and if you think 1369 * about it this is right. Otherwise apps have to 1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1371 * are treated in BSD as hints 1372 */ 1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1374 break; 1375 1376 case SO_RCVBUFFORCE: 1377 if (!sockopt_capable(CAP_NET_ADMIN)) { 1378 ret = -EPERM; 1379 break; 1380 } 1381 1382 /* No negative values (to prevent underflow, as val will be 1383 * multiplied by 2). 1384 */ 1385 __sock_set_rcvbuf(sk, max(val, 0)); 1386 break; 1387 1388 case SO_KEEPALIVE: 1389 if (sk->sk_prot->keepalive) 1390 sk->sk_prot->keepalive(sk, valbool); 1391 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1392 break; 1393 1394 case SO_OOBINLINE: 1395 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1396 break; 1397 1398 case SO_NO_CHECK: 1399 sk->sk_no_check_tx = valbool; 1400 break; 1401 1402 case SO_LINGER: 1403 if (optlen < sizeof(ling)) { 1404 ret = -EINVAL; /* 1003.1g */ 1405 break; 1406 } 1407 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1408 ret = -EFAULT; 1409 break; 1410 } 1411 if (!ling.l_onoff) { 1412 sock_reset_flag(sk, SOCK_LINGER); 1413 } else { 1414 unsigned long t_sec = ling.l_linger; 1415 1416 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1417 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1418 else 1419 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1420 sock_set_flag(sk, SOCK_LINGER); 1421 } 1422 break; 1423 1424 case SO_BSDCOMPAT: 1425 break; 1426 1427 case SO_TIMESTAMP_OLD: 1428 case SO_TIMESTAMP_NEW: 1429 case SO_TIMESTAMPNS_OLD: 1430 case SO_TIMESTAMPNS_NEW: 1431 sock_set_timestamp(sk, optname, valbool); 1432 break; 1433 1434 case SO_TIMESTAMPING_NEW: 1435 case SO_TIMESTAMPING_OLD: 1436 if (optlen == sizeof(timestamping)) { 1437 if (copy_from_sockptr(&timestamping, optval, 1438 sizeof(timestamping))) { 1439 ret = -EFAULT; 1440 break; 1441 } 1442 } else { 1443 memset(&timestamping, 0, sizeof(timestamping)); 1444 timestamping.flags = val; 1445 } 1446 ret = sock_set_timestamping(sk, optname, timestamping); 1447 break; 1448 1449 case SO_RCVLOWAT: 1450 { 1451 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1452 1453 if (val < 0) 1454 val = INT_MAX; 1455 if (sock) 1456 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1457 if (set_rcvlowat) 1458 ret = set_rcvlowat(sk, val); 1459 else 1460 WRITE_ONCE(sk->sk_rcvlowat, val ?
: 1); 1461 break; 1462 } 1463 case SO_ATTACH_FILTER: { 1464 struct sock_fprog fprog; 1465 1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1467 if (!ret) 1468 ret = sk_attach_filter(&fprog, sk); 1469 break; 1470 } 1471 case SO_ATTACH_BPF: 1472 ret = -EINVAL; 1473 if (optlen == sizeof(u32)) { 1474 u32 ufd; 1475 1476 ret = -EFAULT; 1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1478 break; 1479 1480 ret = sk_attach_bpf(ufd, sk); 1481 } 1482 break; 1483 1484 case SO_ATTACH_REUSEPORT_CBPF: { 1485 struct sock_fprog fprog; 1486 1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1488 if (!ret) 1489 ret = sk_reuseport_attach_filter(&fprog, sk); 1490 break; 1491 } 1492 case SO_ATTACH_REUSEPORT_EBPF: 1493 ret = -EINVAL; 1494 if (optlen == sizeof(u32)) { 1495 u32 ufd; 1496 1497 ret = -EFAULT; 1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1499 break; 1500 1501 ret = sk_reuseport_attach_bpf(ufd, sk); 1502 } 1503 break; 1504 1505 case SO_DETACH_REUSEPORT_BPF: 1506 ret = reuseport_detach_prog(sk); 1507 break; 1508 1509 case SO_DETACH_FILTER: 1510 ret = sk_detach_filter(sk); 1511 break; 1512 1513 case SO_LOCK_FILTER: 1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1515 ret = -EPERM; 1516 else 1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1518 break; 1519 1520 case SO_MARK: 1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1523 ret = -EPERM; 1524 break; 1525 } 1526 1527 __sock_set_mark(sk, val); 1528 break; 1529 case SO_RCVMARK: 1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1531 break; 1532 1533 case SO_RCVPRIORITY: 1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1535 break; 1536 1537 case SO_RXQ_OVFL: 1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1539 break; 1540 1541 case SO_WIFI_STATUS: 1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1543 break; 1544 1545 case SO_NOFCS: 1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1547 break; 1548 1549 case SO_SELECT_ERR_QUEUE: 1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1551 break; 1552 1553 case SO_PASSCRED: 1554 if (sk_may_scm_recv(sk)) 1555 sk->sk_scm_credentials = valbool; 1556 else 1557 ret = -EOPNOTSUPP; 1558 break; 1559 1560 case SO_PASSSEC: 1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1562 sk->sk_scm_security = valbool; 1563 else 1564 ret = -EOPNOTSUPP; 1565 break; 1566 1567 case SO_PASSPIDFD: 1568 if (sk_is_unix(sk)) 1569 sk->sk_scm_pidfd = valbool; 1570 else 1571 ret = -EOPNOTSUPP; 1572 break; 1573 1574 case SO_PASSRIGHTS: 1575 if (sk_is_unix(sk)) 1576 sk->sk_scm_rights = valbool; 1577 else 1578 ret = -EOPNOTSUPP; 1579 break; 1580 1581 case SO_INCOMING_CPU: 1582 reuseport_update_incoming_cpu(sk, val); 1583 break; 1584 1585 case SO_CNX_ADVICE: 1586 if (val == 1) 1587 dst_negative_advice(sk); 1588 break; 1589 1590 case SO_ZEROCOPY: 1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1592 if (!(sk_is_tcp(sk) || 1593 (sk->sk_type == SOCK_DGRAM && 1594 sk->sk_protocol == IPPROTO_UDP))) 1595 ret = -EOPNOTSUPP; 1596 } else if (sk->sk_family != PF_RDS) { 1597 ret = -EOPNOTSUPP; 1598 } 1599 if (!ret) { 1600 if (val < 0 || val > 1) 1601 ret = -EINVAL; 1602 else 1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1604 } 1605 break; 1606 1607 case SO_TXTIME: 1608 if (optlen != sizeof(struct sock_txtime)) { 1609 ret = -EINVAL; 1610 break; 1611 } else if (copy_from_sockptr(&sk_txtime, optval, 1612 sizeof(struct sock_txtime))) { 1613 ret = -EFAULT; 1614 
break; 1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1616 ret = -EINVAL; 1617 break; 1618 } 1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1620 * scheduler has enough safe guards. 1621 */ 1622 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1624 ret = -EPERM; 1625 break; 1626 } 1627 1628 ret = sockopt_validate_clockid(sk_txtime.clockid); 1629 if (ret) 1630 break; 1631 1632 sock_valbool_flag(sk, SOCK_TXTIME, true); 1633 sk->sk_clockid = sk_txtime.clockid; 1634 sk->sk_txtime_deadline_mode = 1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1636 sk->sk_txtime_report_errors = 1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1638 break; 1639 1640 case SO_BINDTOIFINDEX: 1641 ret = sock_bindtoindex_locked(sk, val); 1642 break; 1643 1644 case SO_BUF_LOCK: 1645 if (val & ~SOCK_BUF_LOCK_MASK) { 1646 ret = -EINVAL; 1647 break; 1648 } 1649 sk->sk_userlocks = val | (sk->sk_userlocks & 1650 ~SOCK_BUF_LOCK_MASK); 1651 break; 1652 1653 case SO_RESERVE_MEM: 1654 { 1655 int delta; 1656 1657 if (val < 0) { 1658 ret = -EINVAL; 1659 break; 1660 } 1661 1662 delta = val - sk->sk_reserved_mem; 1663 if (delta < 0) 1664 sock_release_reserved_memory(sk, -delta); 1665 else 1666 ret = sock_reserve_memory(sk, delta); 1667 break; 1668 } 1669 1670 default: 1671 ret = -ENOPROTOOPT; 1672 break; 1673 } 1674 sockopt_release_sock(sk); 1675 return ret; 1676 } 1677 1678 int sock_setsockopt(struct socket *sock, int level, int optname, 1679 sockptr_t optval, unsigned int optlen) 1680 { 1681 return sk_setsockopt(sock->sk, level, optname, 1682 optval, optlen); 1683 } 1684 EXPORT_SYMBOL(sock_setsockopt); 1685 1686 static const struct cred *sk_get_peer_cred(struct sock *sk) 1687 { 1688 const struct cred *cred; 1689 1690 spin_lock(&sk->sk_peer_lock); 1691 cred = get_cred(sk->sk_peer_cred); 1692 spin_unlock(&sk->sk_peer_lock); 1693 1694 return cred; 1695 } 1696 1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1698 struct ucred *ucred) 1699 { 1700 ucred->pid = pid_vnr(pid); 1701 ucred->uid = ucred->gid = -1; 1702 if (cred) { 1703 struct user_namespace *current_ns = current_user_ns(); 1704 1705 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1706 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1707 } 1708 } 1709 1710 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1711 { 1712 struct user_namespace *user_ns = current_user_ns(); 1713 int i; 1714 1715 for (i = 0; i < src->ngroups; i++) { 1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1717 1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1719 return -EFAULT; 1720 } 1721 1722 return 0; 1723 } 1724 1725 int sk_getsockopt(struct sock *sk, int level, int optname, 1726 sockptr_t optval, sockptr_t optlen) 1727 { 1728 struct socket *sock = sk->sk_socket; 1729 1730 union { 1731 int val; 1732 u64 val64; 1733 unsigned long ulval; 1734 struct linger ling; 1735 struct old_timeval32 tm32; 1736 struct __kernel_old_timeval tm; 1737 struct __kernel_sock_timeval stm; 1738 struct sock_txtime txtime; 1739 struct so_timestamping timestamping; 1740 } v; 1741 1742 int lv = sizeof(int); 1743 int len; 1744 1745 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1746 return -EFAULT; 1747 if (len < 0) 1748 return -EINVAL; 1749 1750 memset(&v, 0, sizeof(v)); 1751 1752 switch (optname) { 1753 case SO_DEBUG: 1754 v.val = sock_flag(sk, SOCK_DBG); 1755 break; 1756 1757 case SO_DONTROUTE: 1758 v.val = sock_flag(sk, 
SOCK_LOCALROUTE); 1759 break; 1760 1761 case SO_BROADCAST: 1762 v.val = sock_flag(sk, SOCK_BROADCAST); 1763 break; 1764 1765 case SO_SNDBUF: 1766 v.val = READ_ONCE(sk->sk_sndbuf); 1767 break; 1768 1769 case SO_RCVBUF: 1770 v.val = READ_ONCE(sk->sk_rcvbuf); 1771 break; 1772 1773 case SO_REUSEADDR: 1774 v.val = sk->sk_reuse; 1775 break; 1776 1777 case SO_REUSEPORT: 1778 v.val = sk->sk_reuseport; 1779 break; 1780 1781 case SO_KEEPALIVE: 1782 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1783 break; 1784 1785 case SO_TYPE: 1786 v.val = sk->sk_type; 1787 break; 1788 1789 case SO_PROTOCOL: 1790 v.val = sk->sk_protocol; 1791 break; 1792 1793 case SO_DOMAIN: 1794 v.val = sk->sk_family; 1795 break; 1796 1797 case SO_ERROR: 1798 v.val = -sock_error(sk); 1799 if (v.val == 0) 1800 v.val = xchg(&sk->sk_err_soft, 0); 1801 break; 1802 1803 case SO_OOBINLINE: 1804 v.val = sock_flag(sk, SOCK_URGINLINE); 1805 break; 1806 1807 case SO_NO_CHECK: 1808 v.val = sk->sk_no_check_tx; 1809 break; 1810 1811 case SO_PRIORITY: 1812 v.val = READ_ONCE(sk->sk_priority); 1813 break; 1814 1815 case SO_LINGER: 1816 lv = sizeof(v.ling); 1817 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1818 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1819 break; 1820 1821 case SO_BSDCOMPAT: 1822 break; 1823 1824 case SO_TIMESTAMP_OLD: 1825 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1826 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1827 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1828 break; 1829 1830 case SO_TIMESTAMPNS_OLD: 1831 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1832 break; 1833 1834 case SO_TIMESTAMP_NEW: 1835 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1836 break; 1837 1838 case SO_TIMESTAMPNS_NEW: 1839 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1840 break; 1841 1842 case SO_TIMESTAMPING_OLD: 1843 case SO_TIMESTAMPING_NEW: 1844 lv = sizeof(v.timestamping); 1845 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only 1846 * returning the flags when they were set through the same option. 1847 * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
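 * Hypothetical userspace view of that rule:
 *
 *   struct so_timestamping ts;
 *   socklen_t len = sizeof(ts);
 *
 *   getsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING_NEW, &ts, &len);
 *   // ts.flags reads back as 0 unless the flags were installed through
 *   // SO_TIMESTAMPING_NEW (SOCK_TSTAMP_NEW set), whereas
 *   // SO_TIMESTAMPING_OLD keeps reporting them either way.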
1848 */ 1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1852 } 1853 break; 1854 1855 case SO_RCVTIMEO_OLD: 1856 case SO_RCVTIMEO_NEW: 1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1858 SO_RCVTIMEO_OLD == optname); 1859 break; 1860 1861 case SO_SNDTIMEO_OLD: 1862 case SO_SNDTIMEO_NEW: 1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1864 SO_SNDTIMEO_OLD == optname); 1865 break; 1866 1867 case SO_RCVLOWAT: 1868 v.val = READ_ONCE(sk->sk_rcvlowat); 1869 break; 1870 1871 case SO_SNDLOWAT: 1872 v.val = 1; 1873 break; 1874 1875 case SO_PASSCRED: 1876 if (!sk_may_scm_recv(sk)) 1877 return -EOPNOTSUPP; 1878 1879 v.val = sk->sk_scm_credentials; 1880 break; 1881 1882 case SO_PASSPIDFD: 1883 if (!sk_is_unix(sk)) 1884 return -EOPNOTSUPP; 1885 1886 v.val = sk->sk_scm_pidfd; 1887 break; 1888 1889 case SO_PASSRIGHTS: 1890 if (!sk_is_unix(sk)) 1891 return -EOPNOTSUPP; 1892 1893 v.val = sk->sk_scm_rights; 1894 break; 1895 1896 case SO_PEERCRED: 1897 { 1898 struct ucred peercred; 1899 if (len > sizeof(peercred)) 1900 len = sizeof(peercred); 1901 1902 spin_lock(&sk->sk_peer_lock); 1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1904 spin_unlock(&sk->sk_peer_lock); 1905 1906 if (copy_to_sockptr(optval, &peercred, len)) 1907 return -EFAULT; 1908 goto lenout; 1909 } 1910 1911 case SO_PEERPIDFD: 1912 { 1913 struct pid *peer_pid; 1914 struct file *pidfd_file = NULL; 1915 unsigned int flags = 0; 1916 int pidfd; 1917 1918 if (len > sizeof(pidfd)) 1919 len = sizeof(pidfd); 1920 1921 spin_lock(&sk->sk_peer_lock); 1922 peer_pid = get_pid(sk->sk_peer_pid); 1923 spin_unlock(&sk->sk_peer_lock); 1924 1925 if (!peer_pid) 1926 return -ENODATA; 1927 1928 /* The use of PIDFD_STALE requires stashing of struct pid 1929 * on pidfs with pidfs_register_pid() and only AF_UNIX 1930 * were prepared for this. 1931 */ 1932 if (sk->sk_family == AF_UNIX) 1933 flags = PIDFD_STALE; 1934 1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1936 put_pid(peer_pid); 1937 if (pidfd < 0) 1938 return pidfd; 1939 1940 if (copy_to_sockptr(optval, &pidfd, len) || 1941 copy_to_sockptr(optlen, &len, sizeof(int))) { 1942 put_unused_fd(pidfd); 1943 fput(pidfd_file); 1944 1945 return -EFAULT; 1946 } 1947 1948 fd_install(pidfd, pidfd_file); 1949 return 0; 1950 } 1951 1952 case SO_PEERGROUPS: 1953 { 1954 const struct cred *cred; 1955 int ret, n; 1956 1957 cred = sk_get_peer_cred(sk); 1958 if (!cred) 1959 return -ENODATA; 1960 1961 n = cred->group_info->ngroups; 1962 if (len < n * sizeof(gid_t)) { 1963 len = n * sizeof(gid_t); 1964 put_cred(cred); 1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1966 } 1967 len = n * sizeof(gid_t); 1968 1969 ret = groups_to_user(optval, cred->group_info); 1970 put_cred(cred); 1971 if (ret) 1972 return ret; 1973 goto lenout; 1974 } 1975 1976 case SO_PEERNAME: 1977 { 1978 struct sockaddr_storage address; 1979 1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1981 if (lv < 0) 1982 return -ENOTCONN; 1983 if (lv < len) 1984 return -EINVAL; 1985 if (copy_to_sockptr(optval, &address, len)) 1986 return -EFAULT; 1987 goto lenout; 1988 } 1989 1990 /* Dubious BSD thing... Probably nobody even uses it, but 1991 * the UNIX standard wants it for whatever reason... 
-DaveM 1992 */ 1993 case SO_ACCEPTCONN: 1994 v.val = sk->sk_state == TCP_LISTEN; 1995 break; 1996 1997 case SO_PASSSEC: 1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 1999 return -EOPNOTSUPP; 2000 2001 v.val = sk->sk_scm_security; 2002 break; 2003 2004 case SO_PEERSEC: 2005 return security_socket_getpeersec_stream(sock, 2006 optval, optlen, len); 2007 2008 case SO_MARK: 2009 v.val = READ_ONCE(sk->sk_mark); 2010 break; 2011 2012 case SO_RCVMARK: 2013 v.val = sock_flag(sk, SOCK_RCVMARK); 2014 break; 2015 2016 case SO_RCVPRIORITY: 2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2018 break; 2019 2020 case SO_RXQ_OVFL: 2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2022 break; 2023 2024 case SO_WIFI_STATUS: 2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2026 break; 2027 2028 case SO_PEEK_OFF: 2029 if (!READ_ONCE(sock->ops)->set_peek_off) 2030 return -EOPNOTSUPP; 2031 2032 v.val = READ_ONCE(sk->sk_peek_off); 2033 break; 2034 case SO_NOFCS: 2035 v.val = sock_flag(sk, SOCK_NOFCS); 2036 break; 2037 2038 case SO_BINDTODEVICE: 2039 return sock_getbindtodevice(sk, optval, optlen, len); 2040 2041 case SO_GET_FILTER: 2042 len = sk_get_filter(sk, optval, len); 2043 if (len < 0) 2044 return len; 2045 2046 goto lenout; 2047 2048 case SO_LOCK_FILTER: 2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2050 break; 2051 2052 case SO_BPF_EXTENSIONS: 2053 v.val = bpf_tell_extensions(); 2054 break; 2055 2056 case SO_SELECT_ERR_QUEUE: 2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2058 break; 2059 2060 #ifdef CONFIG_NET_RX_BUSY_POLL 2061 case SO_BUSY_POLL: 2062 v.val = READ_ONCE(sk->sk_ll_usec); 2063 break; 2064 case SO_PREFER_BUSY_POLL: 2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2066 break; 2067 #endif 2068 2069 case SO_MAX_PACING_RATE: 2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2072 lv = sizeof(v.ulval); 2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2074 } else { 2075 /* 32bit version */ 2076 v.val = min_t(unsigned long, ~0U, 2077 READ_ONCE(sk->sk_max_pacing_rate)); 2078 } 2079 break; 2080 2081 case SO_INCOMING_CPU: 2082 v.val = READ_ONCE(sk->sk_incoming_cpu); 2083 break; 2084 2085 case SO_MEMINFO: 2086 { 2087 u32 meminfo[SK_MEMINFO_VARS]; 2088 2089 sk_get_meminfo(sk, meminfo); 2090 2091 len = min_t(unsigned int, len, sizeof(meminfo)); 2092 if (copy_to_sockptr(optval, &meminfo, len)) 2093 return -EFAULT; 2094 2095 goto lenout; 2096 } 2097 2098 #ifdef CONFIG_NET_RX_BUSY_POLL 2099 case SO_INCOMING_NAPI_ID: 2100 v.val = READ_ONCE(sk->sk_napi_id); 2101 2102 /* aggregate non-NAPI IDs down to 0 */ 2103 if (!napi_id_valid(v.val)) 2104 v.val = 0; 2105 2106 break; 2107 #endif 2108 2109 case SO_COOKIE: 2110 lv = sizeof(u64); 2111 if (len < lv) 2112 return -EINVAL; 2113 v.val64 = sock_gen_cookie(sk); 2114 break; 2115 2116 case SO_ZEROCOPY: 2117 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2118 break; 2119 2120 case SO_TXTIME: 2121 lv = sizeof(v.txtime); 2122 v.txtime.clockid = sk->sk_clockid; 2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2124 SOF_TXTIME_DEADLINE_MODE : 0; 2125 v.txtime.flags |= sk->sk_txtime_report_errors ? 
2126 SOF_TXTIME_REPORT_ERRORS : 0; 2127 break; 2128 2129 case SO_BINDTOIFINDEX: 2130 v.val = READ_ONCE(sk->sk_bound_dev_if); 2131 break; 2132 2133 case SO_NETNS_COOKIE: 2134 lv = sizeof(u64); 2135 if (len != lv) 2136 return -EINVAL; 2137 v.val64 = sock_net(sk)->net_cookie; 2138 break; 2139 2140 case SO_BUF_LOCK: 2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2142 break; 2143 2144 case SO_RESERVE_MEM: 2145 v.val = READ_ONCE(sk->sk_reserved_mem); 2146 break; 2147 2148 case SO_TXREHASH: 2149 if (!sk_is_tcp(sk)) 2150 return -EOPNOTSUPP; 2151 2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2153 v.val = READ_ONCE(sk->sk_txrehash); 2154 break; 2155 2156 default: 2157 /* We implement the SO_SNDLOWAT etc to not be settable 2158 * (1003.1g 7). 2159 */ 2160 return -ENOPROTOOPT; 2161 } 2162 2163 if (len > lv) 2164 len = lv; 2165 if (copy_to_sockptr(optval, &v, len)) 2166 return -EFAULT; 2167 lenout: 2168 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2169 return -EFAULT; 2170 return 0; 2171 } 2172 2173 /* 2174 * Initialize an sk_lock. 2175 * 2176 * (We also register the sk_lock with the lock validator.) 2177 */ 2178 static inline void sock_lock_init(struct sock *sk) 2179 { 2180 sk_owner_clear(sk); 2181 2182 if (sk->sk_kern_sock) 2183 sock_lock_init_class_and_name( 2184 sk, 2185 af_family_kern_slock_key_strings[sk->sk_family], 2186 af_family_kern_slock_keys + sk->sk_family, 2187 af_family_kern_key_strings[sk->sk_family], 2188 af_family_kern_keys + sk->sk_family); 2189 else 2190 sock_lock_init_class_and_name( 2191 sk, 2192 af_family_slock_key_strings[sk->sk_family], 2193 af_family_slock_keys + sk->sk_family, 2194 af_family_key_strings[sk->sk_family], 2195 af_family_keys + sk->sk_family); 2196 } 2197 2198 /* 2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2200 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2202 */ 2203 static void sock_copy(struct sock *nsk, const struct sock *osk) 2204 { 2205 const struct proto *prot = READ_ONCE(osk->sk_prot); 2206 #ifdef CONFIG_SECURITY_NETWORK 2207 void *sptr = nsk->sk_security; 2208 #endif 2209 2210 /* If we move sk_tx_queue_mapping out of the private section, 2211 * we must check if sk_tx_queue_clear() is called after 2212 * sock_copy() in sk_clone_lock(). 
2213 */ 2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2215 offsetof(struct sock, sk_dontcopy_begin) || 2216 offsetof(struct sock, sk_tx_queue_mapping) >= 2217 offsetof(struct sock, sk_dontcopy_end)); 2218 2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2220 2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2223 /* alloc is larger than struct, see sk_prot_alloc() */); 2224 2225 #ifdef CONFIG_SECURITY_NETWORK 2226 nsk->sk_security = sptr; 2227 security_sk_clone(osk, nsk); 2228 #endif 2229 } 2230 2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2232 int family) 2233 { 2234 struct sock *sk; 2235 struct kmem_cache *slab; 2236 2237 slab = prot->slab; 2238 if (slab != NULL) { 2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2240 if (!sk) 2241 return sk; 2242 if (want_init_on_alloc(priority)) 2243 sk_prot_clear_nulls(sk, prot->obj_size); 2244 } else 2245 sk = kmalloc(prot->obj_size, priority); 2246 2247 if (sk != NULL) { 2248 if (security_sk_alloc(sk, family, priority)) 2249 goto out_free; 2250 2251 if (!try_module_get(prot->owner)) 2252 goto out_free_sec; 2253 } 2254 2255 return sk; 2256 2257 out_free_sec: 2258 security_sk_free(sk); 2259 out_free: 2260 if (slab != NULL) 2261 kmem_cache_free(slab, sk); 2262 else 2263 kfree(sk); 2264 return NULL; 2265 } 2266 2267 static void sk_prot_free(struct proto *prot, struct sock *sk) 2268 { 2269 struct kmem_cache *slab; 2270 struct module *owner; 2271 2272 owner = prot->owner; 2273 slab = prot->slab; 2274 2275 cgroup_sk_free(&sk->sk_cgrp_data); 2276 mem_cgroup_sk_free(sk); 2277 security_sk_free(sk); 2278 2279 sk_owner_put(sk); 2280 2281 if (slab != NULL) 2282 kmem_cache_free(slab, sk); 2283 else 2284 kfree(sk); 2285 module_put(owner); 2286 } 2287 2288 /** 2289 * sk_alloc - All socket objects are allocated here 2290 * @net: the applicable net namespace 2291 * @family: protocol family 2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2293 * @prot: struct proto associated with this new sock instance 2294 * @kern: is this to be a kernel socket? 2295 */ 2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2297 struct proto *prot, int kern) 2298 { 2299 struct sock *sk; 2300 2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2302 if (sk) { 2303 sk->sk_family = family; 2304 /* 2305 * See comment in struct sock definition to understand 2306 * why we need sk_prot_creator -acme 2307 */ 2308 sk->sk_prot = sk->sk_prot_creator = prot; 2309 2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) 2311 sk->sk_bypass_prot_mem = 1; 2312 2313 sk->sk_kern_sock = kern; 2314 sock_lock_init(sk); 2315 2316 sk->sk_net_refcnt = kern ? 0 : 1; 2317 if (likely(sk->sk_net_refcnt)) { 2318 get_net_track(net, &sk->ns_tracker, priority); 2319 sock_inuse_add(net, 1); 2320 } else { 2321 net_passive_inc(net); 2322 __netns_tracker_alloc(net, &sk->ns_tracker, 2323 false, priority); 2324 } 2325 2326 sock_net_set(sk, net); 2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2328 2329 mem_cgroup_sk_alloc(sk); 2330 cgroup_sk_alloc(&sk->sk_cgrp_data); 2331 sock_update_classid(&sk->sk_cgrp_data); 2332 sock_update_netprioidx(&sk->sk_cgrp_data); 2333 sk_tx_queue_clear(sk); 2334 } 2335 2336 return sk; 2337 } 2338 EXPORT_SYMBOL(sk_alloc); 2339 2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2341 * grace period. This is the case for UDP sockets and TCP listeners. 
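 *
 * (Illustrative note: protocols typically opt in with something like
 * sock_set_flag(sk, SOCK_RCU_FREE) once the socket is published in an
 * RCU-protected lookup table, so lockless readers may still dereference
 * it until a grace period has elapsed.)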
2342 */ 2343 static void __sk_destruct(struct rcu_head *head) 2344 { 2345 struct sock *sk = container_of(head, struct sock, sk_rcu); 2346 struct net *net = sock_net(sk); 2347 struct sk_filter *filter; 2348 2349 if (sk->sk_destruct) 2350 sk->sk_destruct(sk); 2351 2352 filter = rcu_dereference_check(sk->sk_filter, 2353 refcount_read(&sk->sk_wmem_alloc) == 0); 2354 if (filter) { 2355 sk_filter_uncharge(sk, filter); 2356 RCU_INIT_POINTER(sk->sk_filter, NULL); 2357 } 2358 2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2360 2361 #ifdef CONFIG_BPF_SYSCALL 2362 bpf_sk_storage_free(sk); 2363 #endif 2364 2365 if (atomic_read(&sk->sk_omem_alloc)) 2366 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2367 __func__, atomic_read(&sk->sk_omem_alloc)); 2368 2369 if (sk->sk_frag.page) { 2370 put_page(sk->sk_frag.page); 2371 sk->sk_frag.page = NULL; 2372 } 2373 2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2375 put_cred(sk->sk_peer_cred); 2376 put_pid(sk->sk_peer_pid); 2377 2378 if (likely(sk->sk_net_refcnt)) { 2379 put_net_track(net, &sk->ns_tracker); 2380 } else { 2381 __netns_tracker_free(net, &sk->ns_tracker, false); 2382 net_passive_dec(net); 2383 } 2384 sk_prot_free(sk->sk_prot_creator, sk); 2385 } 2386 2387 void sk_net_refcnt_upgrade(struct sock *sk) 2388 { 2389 struct net *net = sock_net(sk); 2390 2391 WARN_ON_ONCE(sk->sk_net_refcnt); 2392 __netns_tracker_free(net, &sk->ns_tracker, false); 2393 net_passive_dec(net); 2394 sk->sk_net_refcnt = 1; 2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2396 sock_inuse_add(net, 1); 2397 } 2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2399 2400 void sk_destruct(struct sock *sk) 2401 { 2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2403 2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2405 reuseport_detach_sock(sk); 2406 use_call_rcu = true; 2407 } 2408 2409 if (use_call_rcu) 2410 call_rcu(&sk->sk_rcu, __sk_destruct); 2411 else 2412 __sk_destruct(&sk->sk_rcu); 2413 } 2414 2415 static void __sk_free(struct sock *sk) 2416 { 2417 if (likely(sk->sk_net_refcnt)) 2418 sock_inuse_add(sock_net(sk), -1); 2419 2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2421 sock_diag_broadcast_destroy(sk); 2422 else 2423 sk_destruct(sk); 2424 } 2425 2426 void sk_free(struct sock *sk) 2427 { 2428 /* 2429 * We subtract one from sk_wmem_alloc and can know if 2430 * some packets are still in some tx queue. 
2431 * If not null, sock_wfree() will call __sk_free(sk) later 2432 */ 2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2434 __sk_free(sk); 2435 } 2436 EXPORT_SYMBOL(sk_free); 2437 2438 static void sk_init_common(struct sock *sk) 2439 { 2440 skb_queue_head_init(&sk->sk_receive_queue); 2441 skb_queue_head_init(&sk->sk_write_queue); 2442 skb_queue_head_init(&sk->sk_error_queue); 2443 2444 rwlock_init(&sk->sk_callback_lock); 2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2446 af_rlock_keys + sk->sk_family, 2447 af_family_rlock_key_strings[sk->sk_family]); 2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2449 af_wlock_keys + sk->sk_family, 2450 af_family_wlock_key_strings[sk->sk_family]); 2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2452 af_elock_keys + sk->sk_family, 2453 af_family_elock_key_strings[sk->sk_family]); 2454 if (sk->sk_kern_sock) 2455 lockdep_set_class_and_name(&sk->sk_callback_lock, 2456 af_kern_callback_keys + sk->sk_family, 2457 af_family_kern_clock_key_strings[sk->sk_family]); 2458 else 2459 lockdep_set_class_and_name(&sk->sk_callback_lock, 2460 af_callback_keys + sk->sk_family, 2461 af_family_clock_key_strings[sk->sk_family]); 2462 } 2463 2464 /** 2465 * sk_clone - clone a socket 2466 * @sk: the socket to clone 2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2468 * @lock: if true, lock the cloned sk 2469 * 2470 * If @lock is true, the clone is locked by bh_lock_sock(), and 2471 * caller must unlock socket even in error path by bh_unlock_sock(). 2472 */ 2473 struct sock *sk_clone(const struct sock *sk, const gfp_t priority, 2474 bool lock) 2475 { 2476 struct proto *prot = READ_ONCE(sk->sk_prot); 2477 struct sk_filter *filter; 2478 bool is_charged = true; 2479 struct sock *newsk; 2480 2481 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2482 if (!newsk) 2483 goto out; 2484 2485 sock_copy(newsk, sk); 2486 2487 newsk->sk_prot_creator = prot; 2488 2489 /* SANITY */ 2490 if (likely(newsk->sk_net_refcnt)) { 2491 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2492 sock_inuse_add(sock_net(newsk), 1); 2493 } else { 2494 /* Kernel sockets are not elevating the struct net refcount. 2495 * Instead, use a tracker to more easily detect if a layer 2496 * is not properly dismantling its kernel sockets at netns 2497 * destroy time. 
2498 */ 2499 net_passive_inc(sock_net(newsk)); 2500 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2501 false, priority); 2502 } 2503 2504 sk_node_init(&newsk->sk_node); 2505 sock_lock_init(newsk); 2506 2507 if (lock) 2508 bh_lock_sock(newsk); 2509 2510 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2511 newsk->sk_backlog.len = 0; 2512 2513 atomic_set(&newsk->sk_rmem_alloc, 0); 2514 2515 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2516 2517 atomic_set(&newsk->sk_omem_alloc, 0); 2518 sk_init_common(newsk); 2519 2520 newsk->sk_dst_cache = NULL; 2521 newsk->sk_dst_pending_confirm = 0; 2522 newsk->sk_wmem_queued = 0; 2523 newsk->sk_forward_alloc = 0; 2524 newsk->sk_reserved_mem = 0; 2525 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); 2526 sk_drops_reset(newsk); 2527 newsk->sk_send_head = NULL; 2528 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2529 atomic_set(&newsk->sk_zckey, 0); 2530 2531 sock_reset_flag(newsk, SOCK_DONE); 2532 2533 #ifdef CONFIG_MEMCG 2534 /* sk->sk_memcg will be populated at accept() time */ 2535 newsk->sk_memcg = NULL; 2536 #endif 2537 2538 cgroup_sk_clone(&newsk->sk_cgrp_data); 2539 2540 rcu_read_lock(); 2541 filter = rcu_dereference(sk->sk_filter); 2542 if (filter != NULL) 2543 /* though it's an empty new sock, the charging may fail 2544 * if sysctl_optmem_max was changed between creation of 2545 * original socket and cloning 2546 */ 2547 is_charged = sk_filter_charge(newsk, filter); 2548 RCU_INIT_POINTER(newsk->sk_filter, filter); 2549 rcu_read_unlock(); 2550 2551 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2552 /* We need to make sure that we don't uncharge the new 2553 * socket if we couldn't charge it in the first place 2554 * as otherwise we uncharge the parent's filter. 2555 */ 2556 if (!is_charged) 2557 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2558 2559 goto free; 2560 } 2561 2562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2563 2564 if (bpf_sk_storage_clone(sk, newsk)) 2565 goto free; 2566 2567 /* Clear sk_user_data if parent had the pointer tagged 2568 * as not suitable for copying when cloning. 
2569 */ 2570 if (sk_user_data_is_nocopy(newsk)) 2571 newsk->sk_user_data = NULL; 2572 2573 newsk->sk_err = 0; 2574 newsk->sk_err_soft = 0; 2575 newsk->sk_priority = 0; 2576 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2577 2578 /* Before updating sk_refcnt, we must commit prior changes to memory 2579 * (Documentation/RCU/rculist_nulls.rst for details) 2580 */ 2581 smp_wmb(); 2582 refcount_set(&newsk->sk_refcnt, 2); 2583 2584 sk_set_socket(newsk, NULL); 2585 sk_tx_queue_clear(newsk); 2586 sk_rx_queue_clear(newsk); 2587 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2588 2589 if (newsk->sk_prot->sockets_allocated) 2590 sk_sockets_allocated_inc(newsk); 2591 2592 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2593 net_enable_timestamp(); 2594 out: 2595 return newsk; 2596 free: 2597 /* It is still raw copy of parent, so invalidate 2598 * destructor and make plain sk_free() 2599 */ 2600 newsk->sk_destruct = NULL; 2601 if (lock) 2602 bh_unlock_sock(newsk); 2603 sk_free(newsk); 2604 newsk = NULL; 2605 goto out; 2606 } 2607 EXPORT_SYMBOL_GPL(sk_clone); 2608 2609 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) 2610 { 2611 bool is_ipv6 = false; 2612 u32 max_size; 2613 2614 #if IS_ENABLED(CONFIG_IPV6) 2615 is_ipv6 = (sk->sk_family == AF_INET6 && 2616 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2617 #endif 2618 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2619 max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : 2620 READ_ONCE(dev->gso_ipv4_max_size); 2621 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2622 max_size = GSO_LEGACY_MAX_SIZE; 2623 2624 return max_size - (MAX_TCP_HEADER + 1); 2625 } 2626 2627 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2628 { 2629 const struct net_device *dev; 2630 u32 max_segs = 1; 2631 2632 rcu_read_lock(); 2633 dev = dst_dev_rcu(dst); 2634 sk->sk_route_caps = dev->features; 2635 if (sk_is_tcp(sk)) { 2636 struct inet_connection_sock *icsk = inet_csk(sk); 2637 2638 sk->sk_route_caps |= NETIF_F_GSO; 2639 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); 2640 } 2641 if (sk->sk_route_caps & NETIF_F_GSO) 2642 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2643 if (unlikely(sk->sk_gso_disabled)) 2644 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2645 if (sk_can_gso(sk)) { 2646 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2647 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2648 } else { 2649 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2650 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); 2651 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2652 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); 2653 } 2654 } 2655 sk->sk_gso_max_segs = max_segs; 2656 sk_dst_set(sk, dst); 2657 rcu_read_unlock(); 2658 } 2659 EXPORT_SYMBOL_GPL(sk_setup_caps); 2660 2661 /* 2662 * Simple resource managers for sockets. 2663 */ 2664 2665 2666 /* 2667 * Write buffer destructor automatically called from kfree_skb. 
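 *
 * This is the destructor installed by skb_set_owner_w() below; it releases
 * the skb->truesize charge taken there from sk_wmem_alloc and, unless
 * SOCK_USE_WRITE_QUEUE is set, calls sk->sk_write_space() so that blocked
 * writers can make progress.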
2668 */ 2669 void sock_wfree(struct sk_buff *skb) 2670 { 2671 unsigned int len = skb->truesize; 2672 struct sock *sk = skb->sk; 2673 bool free; 2674 int old; 2675 2676 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2677 if (sock_flag(sk, SOCK_RCU_FREE) && 2678 sk->sk_write_space == sock_def_write_space) { 2679 rcu_read_lock(); 2680 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, 2681 &old); 2682 sock_def_write_space_wfree(sk, old - len); 2683 rcu_read_unlock(); 2684 if (unlikely(free)) 2685 __sk_free(sk); 2686 return; 2687 } 2688 2689 /* 2690 * Keep a reference on sk_wmem_alloc, this will be released 2691 * after sk_write_space() call 2692 */ 2693 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2694 sk->sk_write_space(sk); 2695 len = 1; 2696 } 2697 /* 2698 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2699 * could not do because of in-flight packets 2700 */ 2701 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2702 __sk_free(sk); 2703 } 2704 EXPORT_SYMBOL(sock_wfree); 2705 2706 /* This variant of sock_wfree() is used by TCP, 2707 * since it sets SOCK_USE_WRITE_QUEUE. 2708 */ 2709 void __sock_wfree(struct sk_buff *skb) 2710 { 2711 struct sock *sk = skb->sk; 2712 2713 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2714 __sk_free(sk); 2715 } 2716 2717 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2718 { 2719 int old_wmem; 2720 2721 skb_orphan(skb); 2722 #ifdef CONFIG_INET 2723 if (unlikely(!sk_fullsock(sk))) 2724 return skb_set_owner_edemux(skb, sk); 2725 #endif 2726 skb->sk = sk; 2727 skb->destructor = sock_wfree; 2728 skb_set_hash_from_sk(skb, sk); 2729 /* 2730 * We used to take a refcount on sk, but following operation 2731 * is enough to guarantee sk_free() won't free this sock until 2732 * all in-flight packets are completed 2733 */ 2734 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); 2735 2736 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket 2737 * is in a host queue (qdisc, NIC queue). 2738 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue 2739 * based on XPS for better performance. 2740 * Otherwise clear ooo_okay to not risk Out Of Order delivery. 2741 */ 2742 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); 2743 } 2744 EXPORT_SYMBOL(skb_set_owner_w); 2745 2746 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2747 { 2748 /* Drivers depend on in-order delivery for crypto offload, 2749 * partial orphan breaks out-of-order-OK logic. 2750 */ 2751 if (skb_is_decrypted(skb)) 2752 return false; 2753 2754 return (skb->destructor == sock_wfree || 2755 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2756 } 2757 2758 /* This helper is used by netem, as it can hold packets in its 2759 * delay queue. We want to allow the owner socket to send more 2760 * packets, as if they were already TX completed by a typical driver. 2761 * But we also want to keep skb->sk set because some packet schedulers 2762 * rely on it (sch_fq for example). 2763 */ 2764 void skb_orphan_partial(struct sk_buff *skb) 2765 { 2766 if (skb_is_tcp_pure_ack(skb)) 2767 return; 2768 2769 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2770 return; 2771 2772 skb_orphan(skb); 2773 } 2774 EXPORT_SYMBOL(skb_orphan_partial); 2775 2776 /* 2777 * Read buffer destructor automatically called from kfree_skb. 
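 *
 * Receive-side counterpart of sock_wfree(): it returns the skb->truesize
 * charge to sk_rmem_alloc and to the socket's forward allocation via
 * sk_mem_uncharge(). (It is typically installed through the
 * skb_set_owner_r() helper, which lives outside this file.)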
2778 */ 2779 void sock_rfree(struct sk_buff *skb) 2780 { 2781 struct sock *sk = skb->sk; 2782 unsigned int len = skb->truesize; 2783 2784 atomic_sub(len, &sk->sk_rmem_alloc); 2785 sk_mem_uncharge(sk, len); 2786 } 2787 EXPORT_SYMBOL(sock_rfree); 2788 2789 /* 2790 * Buffer destructor for skbs that are not used directly in read or write 2791 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2792 */ 2793 void sock_efree(struct sk_buff *skb) 2794 { 2795 sock_put(skb->sk); 2796 } 2797 EXPORT_SYMBOL(sock_efree); 2798 2799 /* Buffer destructor for prefetch/receive path where reference count may 2800 * not be held, e.g. for listen sockets. 2801 */ 2802 #ifdef CONFIG_INET 2803 void sock_pfree(struct sk_buff *skb) 2804 { 2805 struct sock *sk = skb->sk; 2806 2807 if (!sk_is_refcounted(sk)) 2808 return; 2809 2810 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2811 inet_reqsk(sk)->rsk_listener = NULL; 2812 reqsk_free(inet_reqsk(sk)); 2813 return; 2814 } 2815 2816 sock_gen_put(sk); 2817 } 2818 EXPORT_SYMBOL(sock_pfree); 2819 #endif /* CONFIG_INET */ 2820 2821 /* 2822 * Allocate a skb from the socket's send buffer. 2823 */ 2824 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2825 gfp_t priority) 2826 { 2827 if (force || 2828 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2829 struct sk_buff *skb = alloc_skb(size, priority); 2830 2831 if (skb) { 2832 skb_set_owner_w(skb, sk); 2833 return skb; 2834 } 2835 } 2836 return NULL; 2837 } 2838 EXPORT_SYMBOL(sock_wmalloc); 2839 2840 static void sock_ofree(struct sk_buff *skb) 2841 { 2842 struct sock *sk = skb->sk; 2843 2844 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2845 } 2846 2847 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2848 gfp_t priority) 2849 { 2850 struct sk_buff *skb; 2851 2852 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2853 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2854 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2855 return NULL; 2856 2857 skb = alloc_skb(size, priority); 2858 if (!skb) 2859 return NULL; 2860 2861 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2862 skb->sk = sk; 2863 skb->destructor = sock_ofree; 2864 return skb; 2865 } 2866 2867 /* 2868 * Allocate a memory block from the socket's option memory buffer. 2869 */ 2870 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2871 { 2872 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2873 2874 if ((unsigned int)size <= optmem_max && 2875 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2876 void *mem; 2877 /* First do the add, to avoid the race if kmalloc 2878 * might sleep. 2879 */ 2880 atomic_add(size, &sk->sk_omem_alloc); 2881 mem = kmalloc(size, priority); 2882 if (mem) 2883 return mem; 2884 atomic_sub(size, &sk->sk_omem_alloc); 2885 } 2886 return NULL; 2887 } 2888 EXPORT_SYMBOL(sock_kmalloc); 2889 2890 /* 2891 * Duplicate the input "src" memory block using the socket's 2892 * option memory buffer. 2893 */ 2894 void *sock_kmemdup(struct sock *sk, const void *src, 2895 int size, gfp_t priority) 2896 { 2897 void *mem; 2898 2899 mem = sock_kmalloc(sk, size, priority); 2900 if (mem) 2901 memcpy(mem, src, size); 2902 return mem; 2903 } 2904 EXPORT_SYMBOL(sock_kmemdup); 2905 2906 /* Free an option memory block. Note, we actually want the inline 2907 * here as this allows gcc to detect the nullify and fold away the 2908 * condition entirely. 
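 *
 * Illustrative pairing (a hypothetical setsockopt() handler, not taken
 * from this file):
 *
 *	opt = sock_kmalloc(sk, size, GFP_KERNEL);
 *	if (!opt)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, opt, size);
 *
 * Each sock_kmalloc()/sock_kmemdup() must be balanced by sock_kfree_s()
 * or sock_kzfree_s() with the same size so that sk_omem_alloc stays
 * accurate.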
2909 */ 2910 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2911 const bool nullify) 2912 { 2913 if (WARN_ON_ONCE(!mem)) 2914 return; 2915 if (nullify) 2916 kfree_sensitive(mem); 2917 else 2918 kfree(mem); 2919 atomic_sub(size, &sk->sk_omem_alloc); 2920 } 2921 2922 void sock_kfree_s(struct sock *sk, void *mem, int size) 2923 { 2924 __sock_kfree_s(sk, mem, size, false); 2925 } 2926 EXPORT_SYMBOL(sock_kfree_s); 2927 2928 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2929 { 2930 __sock_kfree_s(sk, mem, size, true); 2931 } 2932 EXPORT_SYMBOL(sock_kzfree_s); 2933 2934 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2935 I think, these locks should be removed for datagram sockets. 2936 */ 2937 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2938 { 2939 DEFINE_WAIT(wait); 2940 2941 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2942 for (;;) { 2943 if (!timeo) 2944 break; 2945 if (signal_pending(current)) 2946 break; 2947 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2948 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2949 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2950 break; 2951 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2952 break; 2953 if (READ_ONCE(sk->sk_err)) 2954 break; 2955 timeo = schedule_timeout(timeo); 2956 } 2957 finish_wait(sk_sleep(sk), &wait); 2958 return timeo; 2959 } 2960 2961 2962 /* 2963 * Generic send/receive buffer handlers 2964 */ 2965 2966 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2967 unsigned long data_len, int noblock, 2968 int *errcode, int max_page_order) 2969 { 2970 struct sk_buff *skb; 2971 long timeo; 2972 int err; 2973 2974 timeo = sock_sndtimeo(sk, noblock); 2975 for (;;) { 2976 err = sock_error(sk); 2977 if (err != 0) 2978 goto failure; 2979 2980 err = -EPIPE; 2981 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2982 goto failure; 2983 2984 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2985 break; 2986 2987 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2988 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2989 err = -EAGAIN; 2990 if (!timeo) 2991 goto failure; 2992 if (signal_pending(current)) 2993 goto interrupted; 2994 timeo = sock_wait_for_wmem(sk, timeo); 2995 } 2996 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2997 errcode, sk->sk_allocation); 2998 if (skb) 2999 skb_set_owner_w(skb, sk); 3000 return skb; 3001 3002 interrupted: 3003 err = sock_intr_errno(timeo); 3004 failure: 3005 *errcode = err; 3006 return NULL; 3007 } 3008 EXPORT_SYMBOL(sock_alloc_send_pskb); 3009 3010 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 3011 struct sockcm_cookie *sockc) 3012 { 3013 u32 tsflags; 3014 3015 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 3016 3017 switch (cmsg->cmsg_type) { 3018 case SO_MARK: 3019 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 3020 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3021 return -EPERM; 3022 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3023 return -EINVAL; 3024 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 3025 break; 3026 case SO_TIMESTAMPING_OLD: 3027 case SO_TIMESTAMPING_NEW: 3028 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3029 return -EINVAL; 3030 3031 tsflags = *(u32 *)CMSG_DATA(cmsg); 3032 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 3033 return -EINVAL; 3034 3035 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 3036 sockc->tsflags |= tsflags; 3037 break; 3038 case SCM_TXTIME: 3039 if (!sock_flag(sk, SOCK_TXTIME)) 3040 return -EINVAL; 3041 if 
(cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 3042 return -EINVAL; 3043 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 3044 break; 3045 case SCM_TS_OPT_ID: 3046 if (sk_is_tcp(sk)) 3047 return -EINVAL; 3048 tsflags = READ_ONCE(sk->sk_tsflags); 3049 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 3050 return -EINVAL; 3051 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3052 return -EINVAL; 3053 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 3054 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 3055 break; 3056 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 3057 case SCM_RIGHTS: 3058 case SCM_CREDENTIALS: 3059 break; 3060 case SO_PRIORITY: 3061 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3062 return -EINVAL; 3063 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) 3064 return -EPERM; 3065 sockc->priority = *(u32 *)CMSG_DATA(cmsg); 3066 break; 3067 case SCM_DEVMEM_DMABUF: 3068 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3069 return -EINVAL; 3070 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); 3071 break; 3072 default: 3073 return -EINVAL; 3074 } 3075 return 0; 3076 } 3077 EXPORT_SYMBOL(__sock_cmsg_send); 3078 3079 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 3080 struct sockcm_cookie *sockc) 3081 { 3082 struct cmsghdr *cmsg; 3083 int ret; 3084 3085 for_each_cmsghdr(cmsg, msg) { 3086 if (!CMSG_OK(msg, cmsg)) 3087 return -EINVAL; 3088 if (cmsg->cmsg_level != SOL_SOCKET) 3089 continue; 3090 ret = __sock_cmsg_send(sk, cmsg, sockc); 3091 if (ret) 3092 return ret; 3093 } 3094 return 0; 3095 } 3096 EXPORT_SYMBOL(sock_cmsg_send); 3097 3098 static void sk_enter_memory_pressure(struct sock *sk) 3099 { 3100 if (!sk->sk_prot->enter_memory_pressure) 3101 return; 3102 3103 sk->sk_prot->enter_memory_pressure(sk); 3104 } 3105 3106 static void sk_leave_memory_pressure(struct sock *sk) 3107 { 3108 if (sk->sk_prot->leave_memory_pressure) { 3109 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 3110 tcp_leave_memory_pressure, sk); 3111 } else { 3112 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 3113 3114 if (memory_pressure && READ_ONCE(*memory_pressure)) 3115 WRITE_ONCE(*memory_pressure, 0); 3116 } 3117 } 3118 3119 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 3120 3121 /** 3122 * skb_page_frag_refill - check that a page_frag contains enough room 3123 * @sz: minimum size of the fragment we want to get 3124 * @pfrag: pointer to page_frag 3125 * @gfp: priority for memory allocation 3126 * 3127 * Note: While this allocator tries to use high order pages, there is 3128 * no guarantee that allocations succeed. Therefore, @sz MUST be 3129 * less or equal than PAGE_SIZE. 
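 *
 * Illustrative caller pattern (hypothetical, not from this file):
 *
 *	if (!skb_page_frag_refill(copy, pfrag, sk->sk_allocation))
 *		return -ENOMEM;
 *	... copy "copy" bytes into pfrag->page at pfrag->offset ...
 *	pfrag->offset += copy;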
3130 */ 3131 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 3132 { 3133 if (pfrag->page) { 3134 if (page_ref_count(pfrag->page) == 1) { 3135 pfrag->offset = 0; 3136 return true; 3137 } 3138 if (pfrag->offset + sz <= pfrag->size) 3139 return true; 3140 put_page(pfrag->page); 3141 } 3142 3143 pfrag->offset = 0; 3144 if (SKB_FRAG_PAGE_ORDER && 3145 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 3146 /* Avoid direct reclaim but allow kswapd to wake */ 3147 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 3148 __GFP_COMP | __GFP_NOWARN | 3149 __GFP_NORETRY, 3150 SKB_FRAG_PAGE_ORDER); 3151 if (likely(pfrag->page)) { 3152 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 3153 return true; 3154 } 3155 } 3156 pfrag->page = alloc_page(gfp); 3157 if (likely(pfrag->page)) { 3158 pfrag->size = PAGE_SIZE; 3159 return true; 3160 } 3161 return false; 3162 } 3163 EXPORT_SYMBOL(skb_page_frag_refill); 3164 3165 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 3166 { 3167 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3168 return true; 3169 3170 if (!sk->sk_bypass_prot_mem) 3171 sk_enter_memory_pressure(sk); 3172 3173 sk_stream_moderate_sndbuf(sk); 3174 3175 return false; 3176 } 3177 EXPORT_SYMBOL(sk_page_frag_refill); 3178 3179 static void __lock_sock(struct sock *sk) 3180 __releases(&sk->sk_lock.slock) 3181 __acquires(&sk->sk_lock.slock) 3182 { 3183 DEFINE_WAIT(wait); 3184 3185 for (;;) { 3186 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 3187 TASK_UNINTERRUPTIBLE); 3188 spin_unlock_bh(&sk->sk_lock.slock); 3189 schedule(); 3190 spin_lock_bh(&sk->sk_lock.slock); 3191 if (!sock_owned_by_user(sk)) 3192 break; 3193 } 3194 finish_wait(&sk->sk_lock.wq, &wait); 3195 } 3196 3197 void __release_sock(struct sock *sk) 3198 __releases(&sk->sk_lock.slock) 3199 __acquires(&sk->sk_lock.slock) 3200 { 3201 struct sk_buff *skb, *next; 3202 int nb = 0; 3203 3204 while ((skb = sk->sk_backlog.head) != NULL) { 3205 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 3206 3207 spin_unlock_bh(&sk->sk_lock.slock); 3208 3209 while (1) { 3210 next = skb->next; 3211 prefetch(next); 3212 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3213 skb_mark_not_on_list(skb); 3214 sk_backlog_rcv(sk, skb); 3215 3216 skb = next; 3217 if (!skb) 3218 break; 3219 3220 if (!(++nb & 15)) 3221 cond_resched(); 3222 } 3223 3224 spin_lock_bh(&sk->sk_lock.slock); 3225 } 3226 3227 /* 3228 * Doing the zeroing here guarantee we can not loop forever 3229 * while a wild producer attempts to flood us. 3230 */ 3231 sk->sk_backlog.len = 0; 3232 } 3233 3234 void __sk_flush_backlog(struct sock *sk) 3235 { 3236 spin_lock_bh(&sk->sk_lock.slock); 3237 __release_sock(sk); 3238 3239 if (sk->sk_prot->release_cb) 3240 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3241 tcp_release_cb, sk); 3242 3243 spin_unlock_bh(&sk->sk_lock.slock); 3244 } 3245 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3246 3247 /** 3248 * sk_wait_data - wait for data to arrive at sk_receive_queue 3249 * @sk: sock to wait on 3250 * @timeo: for how long 3251 * @skb: last skb seen on sk_receive_queue 3252 * 3253 * Now socket state including sk->sk_err is changed only under lock, 3254 * hence we may omit checks after joining wait queue. 3255 * We check receive queue before schedule() only as optimization; 3256 * it is very likely that release_sock() added new data. 
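 *
 * Illustrative caller pattern (hypothetical, not from this file): a
 * recvmsg() implementation that holds lock_sock(sk) would typically do
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	...
 *	sk_wait_data(sk, &timeo, skb);
 *
 * and then re-check the receive queue.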
3257 */
3258 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
3259 {
3260 DEFINE_WAIT_FUNC(wait, woken_wake_function);
3261 int rc;
3262
3263 add_wait_queue(sk_sleep(sk), &wait);
3264 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3265 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
3266 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
3267 remove_wait_queue(sk_sleep(sk), &wait);
3268 return rc;
3269 }
3270 EXPORT_SYMBOL(sk_wait_data);
3271
3272 /**
3273 * __sk_mem_raise_allocated - increase memory_allocated
3274 * @sk: socket
3275 * @size: memory size to allocate
3276 * @amt: pages to allocate
3277 * @kind: allocation type
3278 *
3279 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3280 *
3281 * Unlike the globally shared limits among the sockets under same protocol,
3282 * consuming the budget of a memcg won't have direct effect on other ones.
3283 * So be optimistic about memcg's tolerance, and leave the callers to decide
3284 * whether or not to raise allocated through sk_under_memory_pressure() or
3285 * its variants.
3286 */
3287 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
3288 {
3289 bool memcg_enabled = false, charged = false;
3290 struct proto *prot = sk->sk_prot;
3291 long allocated = 0;
3292
3293 if (!sk->sk_bypass_prot_mem) {
3294 sk_memory_allocated_add(sk, amt);
3295 allocated = sk_memory_allocated(sk);
3296 }
3297
3298 if (mem_cgroup_sk_enabled(sk)) {
3299 memcg_enabled = true;
3300 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
3301 if (!charged)
3302 goto suppress_allocation;
3303 }
3304
3305 if (!allocated)
3306 return 1;
3307
3308 /* Under limit. */
3309 if (allocated <= sk_prot_mem_limits(sk, 0)) {
3310 sk_leave_memory_pressure(sk);
3311 return 1;
3312 }
3313
3314 /* Under pressure. */
3315 if (allocated > sk_prot_mem_limits(sk, 1))
3316 sk_enter_memory_pressure(sk);
3317
3318 /* Over hard limit. */
3319 if (allocated > sk_prot_mem_limits(sk, 2))
3320 goto suppress_allocation;
3321
3322 /* Guarantee minimum buffer size under pressure (either global
3323 * or memcg) to make sure features described in RFC 7323 (TCP
3324 * Extensions for High Performance) work properly.
3325 *
3326 * This rule does NOT stand when usage exceeds the global or memcg
3327 * hard limit, or else a DoS attack could take place by spawning
3328 * lots of sockets whose usage is under the minimum buffer size.
3329 */
3330 if (kind == SK_MEM_RECV) {
3331 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
3332 return 1;
3333
3334 } else { /* SK_MEM_SEND */
3335 int wmem0 = sk_get_wmem0(sk, prot);
3336
3337 if (sk->sk_type == SOCK_STREAM) {
3338 if (sk->sk_wmem_queued < wmem0)
3339 return 1;
3340 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
3341 return 1;
3342 }
3343 }
3344
3345 if (sk_has_memory_pressure(sk)) {
3346 u64 alloc;
3347
3348 /* The following 'average' heuristic is within the
3349 * scope of global accounting, so it only makes
3350 * sense for global memory pressure.
3351 */
3352 if (!sk_under_global_memory_pressure(sk))
3353 return 1;
3354
3355 /* Try to be fair among all the sockets under global
3356 * pressure by allowing the ones whose usage is below
3357 * average to raise.
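 *
 * (Illustrative reading of the check below: raising is allowed while
 * sk_prot_mem_limits(sk, 2) divided by the number of allocated sockets,
 * i.e. the average share per socket, still exceeds this socket's own
 * usage rounded up to pages.)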
3358 */ 3359 alloc = sk_sockets_allocated_read_positive(sk); 3360 if (sk_prot_mem_limits(sk, 2) > alloc * 3361 sk_mem_pages(sk->sk_wmem_queued + 3362 atomic_read(&sk->sk_rmem_alloc) + 3363 sk->sk_forward_alloc)) 3364 return 1; 3365 } 3366 3367 suppress_allocation: 3368 3369 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3370 sk_stream_moderate_sndbuf(sk); 3371 3372 /* Fail only if socket is _under_ its sndbuf. 3373 * In this case we cannot block, so that we have to fail. 3374 */ 3375 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3376 /* Force charge with __GFP_NOFAIL */ 3377 if (memcg_enabled && !charged) 3378 mem_cgroup_sk_charge(sk, amt, 3379 gfp_memcg_charge() | __GFP_NOFAIL); 3380 return 1; 3381 } 3382 } 3383 3384 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3385 3386 if (allocated) 3387 sk_memory_allocated_sub(sk, amt); 3388 3389 if (charged) 3390 mem_cgroup_sk_uncharge(sk, amt); 3391 3392 return 0; 3393 } 3394 3395 /** 3396 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3397 * @sk: socket 3398 * @size: memory size to allocate 3399 * @kind: allocation type 3400 * 3401 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3402 * rmem allocation. This function assumes that protocols which have 3403 * memory_pressure use sk_wmem_queued as write buffer accounting. 3404 */ 3405 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3406 { 3407 int ret, amt = sk_mem_pages(size); 3408 3409 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3410 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3411 if (!ret) 3412 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3413 return ret; 3414 } 3415 EXPORT_SYMBOL(__sk_mem_schedule); 3416 3417 /** 3418 * __sk_mem_reduce_allocated - reclaim memory_allocated 3419 * @sk: socket 3420 * @amount: number of quanta 3421 * 3422 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3423 */ 3424 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3425 { 3426 if (mem_cgroup_sk_enabled(sk)) 3427 mem_cgroup_sk_uncharge(sk, amount); 3428 3429 if (sk->sk_bypass_prot_mem) 3430 return; 3431 3432 sk_memory_allocated_sub(sk, amount); 3433 3434 if (sk_under_global_memory_pressure(sk) && 3435 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3436 sk_leave_memory_pressure(sk); 3437 } 3438 3439 /** 3440 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3441 * @sk: socket 3442 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3443 */ 3444 void __sk_mem_reclaim(struct sock *sk, int amount) 3445 { 3446 amount >>= PAGE_SHIFT; 3447 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3448 __sk_mem_reduce_allocated(sk, amount); 3449 } 3450 EXPORT_SYMBOL(__sk_mem_reclaim); 3451 3452 void __sk_charge(struct sock *sk, gfp_t gfp) 3453 { 3454 int amt; 3455 3456 gfp |= __GFP_NOFAIL; 3457 if (mem_cgroup_from_sk(sk)) { 3458 /* The socket has not been accepted yet, no need 3459 * to look at newsk->sk_wmem_queued. 3460 */ 3461 amt = sk_mem_pages(sk->sk_forward_alloc + 3462 atomic_read(&sk->sk_rmem_alloc)); 3463 if (amt) 3464 mem_cgroup_sk_charge(sk, amt, gfp); 3465 } 3466 3467 kmem_cache_charge(sk, gfp); 3468 } 3469 3470 int sk_set_peek_off(struct sock *sk, int val) 3471 { 3472 WRITE_ONCE(sk->sk_peek_off, val); 3473 return 0; 3474 } 3475 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3476 3477 /* 3478 * Set of default routines for initialising struct proto_ops when 3479 * the protocol does not support a particular function. 
In certain 3480 * cases where it makes no sense for a protocol to have a "do nothing" 3481 * function, some default processing is provided. 3482 */ 3483 3484 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) 3485 { 3486 return -EOPNOTSUPP; 3487 } 3488 EXPORT_SYMBOL(sock_no_bind); 3489 3490 int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, 3491 int len, int flags) 3492 { 3493 return -EOPNOTSUPP; 3494 } 3495 EXPORT_SYMBOL(sock_no_connect); 3496 3497 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3498 { 3499 return -EOPNOTSUPP; 3500 } 3501 EXPORT_SYMBOL(sock_no_socketpair); 3502 3503 int sock_no_accept(struct socket *sock, struct socket *newsock, 3504 struct proto_accept_arg *arg) 3505 { 3506 return -EOPNOTSUPP; 3507 } 3508 EXPORT_SYMBOL(sock_no_accept); 3509 3510 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3511 int peer) 3512 { 3513 return -EOPNOTSUPP; 3514 } 3515 EXPORT_SYMBOL(sock_no_getname); 3516 3517 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3518 { 3519 return -EOPNOTSUPP; 3520 } 3521 EXPORT_SYMBOL(sock_no_ioctl); 3522 3523 int sock_no_listen(struct socket *sock, int backlog) 3524 { 3525 return -EOPNOTSUPP; 3526 } 3527 EXPORT_SYMBOL(sock_no_listen); 3528 3529 int sock_no_shutdown(struct socket *sock, int how) 3530 { 3531 return -EOPNOTSUPP; 3532 } 3533 EXPORT_SYMBOL(sock_no_shutdown); 3534 3535 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3536 { 3537 return -EOPNOTSUPP; 3538 } 3539 EXPORT_SYMBOL(sock_no_sendmsg); 3540 3541 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3542 { 3543 return -EOPNOTSUPP; 3544 } 3545 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3546 3547 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3548 int flags) 3549 { 3550 return -EOPNOTSUPP; 3551 } 3552 EXPORT_SYMBOL(sock_no_recvmsg); 3553 3554 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3555 { 3556 /* Mirror missing mmap method error code */ 3557 return -ENODEV; 3558 } 3559 EXPORT_SYMBOL(sock_no_mmap); 3560 3561 /* 3562 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3563 * various sock-based usage counts. 
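 *
 * (Illustrative note: this runs when a file descriptor is installed from
 * a received SCM_RIGHTS message, so a socket passed between processes
 * picks up the receiving task's cgroup classid and netprio configuration.)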
3564 */ 3565 void __receive_sock(struct file *file) 3566 { 3567 struct socket *sock; 3568 3569 sock = sock_from_file(file); 3570 if (sock) { 3571 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3572 sock_update_classid(&sock->sk->sk_cgrp_data); 3573 } 3574 } 3575 3576 /* 3577 * Default Socket Callbacks 3578 */ 3579 3580 static void sock_def_wakeup(struct sock *sk) 3581 { 3582 struct socket_wq *wq; 3583 3584 rcu_read_lock(); 3585 wq = rcu_dereference(sk->sk_wq); 3586 if (skwq_has_sleeper(wq)) 3587 wake_up_interruptible_all(&wq->wait); 3588 rcu_read_unlock(); 3589 } 3590 3591 static void sock_def_error_report(struct sock *sk) 3592 { 3593 struct socket_wq *wq; 3594 3595 rcu_read_lock(); 3596 wq = rcu_dereference(sk->sk_wq); 3597 if (skwq_has_sleeper(wq)) 3598 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3599 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3600 rcu_read_unlock(); 3601 } 3602 3603 void sock_def_readable(struct sock *sk) 3604 { 3605 struct socket_wq *wq; 3606 3607 trace_sk_data_ready(sk); 3608 3609 rcu_read_lock(); 3610 wq = rcu_dereference(sk->sk_wq); 3611 if (skwq_has_sleeper(wq)) 3612 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3613 EPOLLRDNORM | EPOLLRDBAND); 3614 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3615 rcu_read_unlock(); 3616 } 3617 3618 static void sock_def_write_space(struct sock *sk) 3619 { 3620 struct socket_wq *wq; 3621 3622 rcu_read_lock(); 3623 3624 /* Do not wake up a writer until he can make "significant" 3625 * progress. --DaveM 3626 */ 3627 if (sock_writeable(sk)) { 3628 wq = rcu_dereference(sk->sk_wq); 3629 if (skwq_has_sleeper(wq)) 3630 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3631 EPOLLWRNORM | EPOLLWRBAND); 3632 3633 /* Should agree with poll, otherwise some programs break */ 3634 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3635 } 3636 3637 rcu_read_unlock(); 3638 } 3639 3640 /* An optimised version of sock_def_write_space(), should only be called 3641 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3642 * ->sk_wmem_alloc. 3643 */ 3644 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) 3645 { 3646 /* Do not wake up a writer until he can make "significant" 3647 * progress. 
--DaveM 3648 */ 3649 if (__sock_writeable(sk, wmem_alloc)) { 3650 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3651 3652 /* rely on refcount_sub from sock_wfree() */ 3653 smp_mb__after_atomic(); 3654 if (wq && waitqueue_active(&wq->wait)) 3655 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3656 EPOLLWRNORM | EPOLLWRBAND); 3657 3658 /* Should agree with poll, otherwise some programs break */ 3659 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3660 } 3661 } 3662 3663 static void sock_def_destruct(struct sock *sk) 3664 { 3665 } 3666 3667 void sk_send_sigurg(struct sock *sk) 3668 { 3669 if (sk->sk_socket && sk->sk_socket->file) 3670 if (send_sigurg(sk->sk_socket->file)) 3671 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3672 } 3673 EXPORT_SYMBOL(sk_send_sigurg); 3674 3675 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3676 unsigned long expires) 3677 { 3678 if (!mod_timer(timer, expires)) 3679 sock_hold(sk); 3680 } 3681 EXPORT_SYMBOL(sk_reset_timer); 3682 3683 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3684 { 3685 if (timer_delete(timer)) 3686 __sock_put(sk); 3687 } 3688 EXPORT_SYMBOL(sk_stop_timer); 3689 3690 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3691 { 3692 if (timer_delete_sync(timer)) 3693 __sock_put(sk); 3694 } 3695 EXPORT_SYMBOL(sk_stop_timer_sync); 3696 3697 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3698 { 3699 sk_init_common(sk); 3700 sk->sk_send_head = NULL; 3701 3702 timer_setup(&sk->sk_timer, NULL, 0); 3703 3704 sk->sk_allocation = GFP_KERNEL; 3705 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3706 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3707 sk->sk_state = TCP_CLOSE; 3708 sk->sk_use_task_frag = true; 3709 sk_set_socket(sk, sock); 3710 3711 sock_set_flag(sk, SOCK_ZAPPED); 3712 3713 if (sock) { 3714 sk->sk_type = sock->type; 3715 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3716 sock->sk = sk; 3717 } else { 3718 RCU_INIT_POINTER(sk->sk_wq, NULL); 3719 } 3720 sk->sk_uid = uid; 3721 3722 sk->sk_state_change = sock_def_wakeup; 3723 sk->sk_data_ready = sock_def_readable; 3724 sk->sk_write_space = sock_def_write_space; 3725 sk->sk_error_report = sock_def_error_report; 3726 sk->sk_destruct = sock_def_destruct; 3727 3728 sk->sk_frag.page = NULL; 3729 sk->sk_frag.offset = 0; 3730 sk->sk_peek_off = -1; 3731 3732 sk->sk_peer_pid = NULL; 3733 sk->sk_peer_cred = NULL; 3734 spin_lock_init(&sk->sk_peer_lock); 3735 3736 sk->sk_write_pending = 0; 3737 sk->sk_rcvlowat = 1; 3738 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3739 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3740 3741 sk->sk_stamp = SK_DEFAULT_STAMP; 3742 #if BITS_PER_LONG==32 3743 seqlock_init(&sk->sk_stamp_seq); 3744 #endif 3745 atomic_set(&sk->sk_zckey, 0); 3746 3747 #ifdef CONFIG_NET_RX_BUSY_POLL 3748 sk->sk_napi_id = 0; 3749 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3750 #endif 3751 3752 sk->sk_max_pacing_rate = ~0UL; 3753 sk->sk_pacing_rate = ~0UL; 3754 WRITE_ONCE(sk->sk_pacing_shift, 10); 3755 sk->sk_incoming_cpu = -1; 3756 3757 sk_rx_queue_clear(sk); 3758 /* 3759 * Before updating sk_refcnt, we must commit prior changes to memory 3760 * (Documentation/RCU/rculist_nulls.rst for details) 3761 */ 3762 smp_wmb(); 3763 refcount_set(&sk->sk_refcnt, 1); 3764 sk_drops_reset(sk); 3765 } 3766 EXPORT_SYMBOL(sock_init_data_uid); 3767 3768 void sock_init_data(struct socket *sock, struct sock *sk) 3769 { 3770 kuid_t uid = sock ? 
3771 SOCK_INODE(sock)->i_uid : 3772 make_kuid(sock_net(sk)->user_ns, 0); 3773 3774 sock_init_data_uid(sock, sk, uid); 3775 } 3776 EXPORT_SYMBOL(sock_init_data); 3777 3778 void noinline lock_sock_nested(struct sock *sk, int subclass) 3779 { 3780 /* The sk_lock has mutex_lock() semantics here. */ 3781 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3782 3783 might_sleep(); 3784 #ifdef CONFIG_64BIT 3785 if (sizeof(struct slock_owned) == sizeof(long)) { 3786 socket_lock_t tmp = { 3787 .slock = __SPIN_LOCK_UNLOCKED(tmp.slock), 3788 .owned = 1, 3789 }; 3790 socket_lock_t old = { 3791 .slock = __SPIN_LOCK_UNLOCKED(old.slock), 3792 .owned = 0, 3793 }; 3794 3795 if (likely(try_cmpxchg(&sk->sk_lock.combined, 3796 &old.combined, tmp.combined))) 3797 return; 3798 } 3799 #endif 3800 spin_lock_bh(&sk->sk_lock.slock); 3801 if (unlikely(sock_owned_by_user_nocheck(sk))) 3802 __lock_sock(sk); 3803 sk->sk_lock.owned = 1; 3804 spin_unlock_bh(&sk->sk_lock.slock); 3805 } 3806 EXPORT_SYMBOL(lock_sock_nested); 3807 3808 void release_sock(struct sock *sk) 3809 { 3810 spin_lock_bh(&sk->sk_lock.slock); 3811 3812 if (unlikely(sk->sk_backlog.tail)) 3813 __release_sock(sk); 3814 3815 if (sk->sk_prot->release_cb) { 3816 if (!tcp_release_cb_cond(sk)) 3817 sk->sk_prot->release_cb(sk); 3818 } 3819 sock_release_ownership(sk); 3820 if (unlikely(waitqueue_active(&sk->sk_lock.wq))) 3821 wake_up(&sk->sk_lock.wq); 3822 3823 spin_unlock_bh(&sk->sk_lock.slock); 3824 } 3825 EXPORT_SYMBOL(release_sock); 3826 3827 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3828 { 3829 might_sleep(); 3830 spin_lock_bh(&sk->sk_lock.slock); 3831 3832 if (likely(!sock_owned_by_user_nocheck(sk))) { 3833 /* 3834 * Fast path return with bottom halves disabled and 3835 * sock::sk_lock.slock held. 3836 * 3837 * The 'mutex' is not contended and holding 3838 * sock::sk_lock.slock prevents all other lockers to 3839 * proceed so the corresponding unlock_sock_fast() can 3840 * avoid the slow path of release_sock() completely and 3841 * just release slock. 3842 * 3843 * From a semantical POV this is equivalent to 'acquiring' 3844 * the 'mutex', hence the corresponding lockdep 3845 * mutex_release() has to happen in the fast path of 3846 * unlock_sock_fast(). 
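 *
 * (Illustrative caller pairing, hypothetical and not from this file:
 * users go through the lock_sock_fast() wrapper and must feed its
 * result back, e.g.
 *
 *	slow = lock_sock_fast(sk);
 *	...
 *	unlock_sock_fast(sk, slow);
 * )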
3847 */ 3848 return false; 3849 } 3850 3851 __lock_sock(sk); 3852 sk->sk_lock.owned = 1; 3853 __acquire(&sk->sk_lock.slock); 3854 spin_unlock_bh(&sk->sk_lock.slock); 3855 return true; 3856 } 3857 EXPORT_SYMBOL(__lock_sock_fast); 3858 3859 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3860 bool timeval, bool time32) 3861 { 3862 struct sock *sk = sock->sk; 3863 struct timespec64 ts; 3864 3865 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3866 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3867 if (ts.tv_sec == -1) 3868 return -ENOENT; 3869 if (ts.tv_sec == 0) { 3870 ktime_t kt = ktime_get_real(); 3871 sock_write_timestamp(sk, kt); 3872 ts = ktime_to_timespec64(kt); 3873 } 3874 3875 if (timeval) 3876 ts.tv_nsec /= 1000; 3877 3878 #ifdef CONFIG_COMPAT_32BIT_TIME 3879 if (time32) 3880 return put_old_timespec32(&ts, userstamp); 3881 #endif 3882 #ifdef CONFIG_SPARC64 3883 /* beware of padding in sparc64 timeval */ 3884 if (timeval && !in_compat_syscall()) { 3885 struct __kernel_old_timeval __user tv = { 3886 .tv_sec = ts.tv_sec, 3887 .tv_usec = ts.tv_nsec, 3888 }; 3889 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3890 return -EFAULT; 3891 return 0; 3892 } 3893 #endif 3894 return put_timespec64(&ts, userstamp); 3895 } 3896 EXPORT_SYMBOL(sock_gettstamp); 3897 3898 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3899 { 3900 if (!sock_flag(sk, flag)) { 3901 unsigned long previous_flags = sk->sk_flags; 3902 3903 sock_set_flag(sk, flag); 3904 /* 3905 * we just set one of the two flags which require net 3906 * time stamping, but time stamping might have been on 3907 * already because of the other one 3908 */ 3909 if (sock_needs_netstamp(sk) && 3910 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3911 net_enable_timestamp(); 3912 } 3913 } 3914 3915 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3916 int level, int type) 3917 { 3918 struct sock_extended_err ee; 3919 struct sk_buff *skb; 3920 int copied, err; 3921 3922 err = -EAGAIN; 3923 skb = sock_dequeue_err_skb(sk); 3924 if (skb == NULL) 3925 goto out; 3926 3927 copied = skb->len; 3928 if (copied > len) { 3929 msg->msg_flags |= MSG_TRUNC; 3930 copied = len; 3931 } 3932 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3933 if (err) 3934 goto out_free_skb; 3935 3936 sock_recv_timestamp(msg, sk, skb); 3937 3938 /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ 3939 ee = SKB_EXT_ERR(skb)->ee; 3940 put_cmsg(msg, level, type, sizeof(ee), &ee); 3941 3942 msg->msg_flags |= MSG_ERRQUEUE; 3943 err = copied; 3944 3945 out_free_skb: 3946 kfree_skb(skb); 3947 out: 3948 return err; 3949 } 3950 EXPORT_SYMBOL(sock_recv_errqueue); 3951 3952 /* 3953 * Get a socket option on an socket. 3954 * 3955 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3956 * asynchronous errors should be reported by getsockopt. We assume 3957 * this means if you specify SO_ERROR (otherwise what is the point of it). 3958 */ 3959 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3960 char __user *optval, int __user *optlen) 3961 { 3962 struct sock *sk = sock->sk; 3963 3964 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/
3965 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3966 }
3967 EXPORT_SYMBOL(sock_common_getsockopt);
3968
3969 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3970 int flags)
3971 {
3972 struct sock *sk = sock->sk;
3973
3974 return sk->sk_prot->recvmsg(sk, msg, size, flags);
3975 }
3976 EXPORT_SYMBOL(sock_common_recvmsg);
3977
3978 /*
3979 * Set socket options on an inet socket.
3980 */
3981 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3982 sockptr_t optval, unsigned int optlen)
3983 {
3984 struct sock *sk = sock->sk;
3985
3986 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3987 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3988 }
3989 EXPORT_SYMBOL(sock_common_setsockopt);
3990
3991 void sk_common_release(struct sock *sk)
3992 {
3993 if (sk->sk_prot->destroy)
3994 sk->sk_prot->destroy(sk);
3995
3996 /*
3997 * Observation: when sk_common_release is called, processes have
3998 * no access to the socket. But the net still has.
3999 * Step one, detach it from networking:
4000 *
4001 * A. Remove from hash tables.
4002 */
4003
4004 sk->sk_prot->unhash(sk);
4005
4006 /*
4007 * At this point the socket cannot receive new packets, but it is
4008 * possible that some packets are still in flight because some CPU
4009 * ran the receiver and did a hash table lookup before we unhashed
4010 * the socket. They will reach the receive queue and be purged by
4011 * the socket destructor.
4012 *
4013 * Also we still have packets pending on the receive queue and
4014 * probably our own packets waiting in device queues. sock_destroy
4015 * will drain the receive queue, but transmitted packets will delay
4016 * socket destruction until the last reference is released.
4017 */
4018
4019 sock_orphan(sk);
4020
4021 xfrm_sk_free_policy(sk);
4022
4023 sock_put(sk);
4024 }
4025 EXPORT_SYMBOL(sk_common_release);
4026
4027 void sk_get_meminfo(const struct sock *sk, u32 *mem)
4028 {
4029 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
4030
4031 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
4032 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
4033 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
4034 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
4035 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc);
4036 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
4037 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
4038 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
4039 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk);
4040 }
4041
4042 #ifdef CONFIG_PROC_FS
4043 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
4044
4045 int sock_prot_inuse_get(struct net *net, struct proto *prot)
4046 {
4047 int cpu, idx = prot->inuse_idx;
4048 int res = 0;
4049
4050 for_each_possible_cpu(cpu)
4051 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
4052
4053 return res >= 0 ?
res : 0; 4053 } 4054 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4055 4056 int sock_inuse_get(struct net *net) 4057 { 4058 int cpu, res = 0; 4059 4060 for_each_possible_cpu(cpu) 4061 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4062 4063 return res; 4064 } 4065 4066 EXPORT_SYMBOL_GPL(sock_inuse_get); 4067 4068 static int __net_init sock_inuse_init_net(struct net *net) 4069 { 4070 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4071 if (net->core.prot_inuse == NULL) 4072 return -ENOMEM; 4073 return 0; 4074 } 4075 4076 static void __net_exit sock_inuse_exit_net(struct net *net) 4077 { 4078 free_percpu(net->core.prot_inuse); 4079 } 4080 4081 static struct pernet_operations net_inuse_ops = { 4082 .init = sock_inuse_init_net, 4083 .exit = sock_inuse_exit_net, 4084 }; 4085 4086 static __init int net_inuse_init(void) 4087 { 4088 if (register_pernet_subsys(&net_inuse_ops)) 4089 panic("Cannot initialize net inuse counters"); 4090 4091 return 0; 4092 } 4093 4094 core_initcall(net_inuse_init); 4095 4096 static int assign_proto_idx(struct proto *prot) 4097 { 4098 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4099 4100 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4101 pr_err("PROTO_INUSE_NR exhausted\n"); 4102 return -ENOSPC; 4103 } 4104 4105 set_bit(prot->inuse_idx, proto_inuse_idx); 4106 return 0; 4107 } 4108 4109 static void release_proto_idx(struct proto *prot) 4110 { 4111 if (prot->inuse_idx != PROTO_INUSE_NR) 4112 clear_bit(prot->inuse_idx, proto_inuse_idx); 4113 } 4114 #else 4115 static inline int assign_proto_idx(struct proto *prot) 4116 { 4117 return 0; 4118 } 4119 4120 static inline void release_proto_idx(struct proto *prot) 4121 { 4122 } 4123 4124 #endif 4125 4126 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4127 { 4128 if (!twsk_prot) 4129 return; 4130 kfree(twsk_prot->twsk_slab_name); 4131 twsk_prot->twsk_slab_name = NULL; 4132 kmem_cache_destroy(twsk_prot->twsk_slab); 4133 twsk_prot->twsk_slab = NULL; 4134 } 4135 4136 static int tw_prot_init(const struct proto *prot) 4137 { 4138 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4139 4140 if (!twsk_prot) 4141 return 0; 4142 4143 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4144 prot->name); 4145 if (!twsk_prot->twsk_slab_name) 4146 return -ENOMEM; 4147 4148 twsk_prot->twsk_slab = 4149 kmem_cache_create(twsk_prot->twsk_slab_name, 4150 twsk_prot->twsk_obj_size, 0, 4151 SLAB_ACCOUNT | prot->slab_flags, 4152 NULL); 4153 if (!twsk_prot->twsk_slab) { 4154 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4155 prot->name); 4156 return -ENOMEM; 4157 } 4158 4159 return 0; 4160 } 4161 4162 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4163 { 4164 if (!rsk_prot) 4165 return; 4166 kfree(rsk_prot->slab_name); 4167 rsk_prot->slab_name = NULL; 4168 kmem_cache_destroy(rsk_prot->slab); 4169 rsk_prot->slab = NULL; 4170 } 4171 4172 static int req_prot_init(const struct proto *prot) 4173 { 4174 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4175 4176 if (!rsk_prot) 4177 return 0; 4178 4179 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4180 prot->name); 4181 if (!rsk_prot->slab_name) 4182 return -ENOMEM; 4183 4184 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4185 rsk_prot->obj_size, 0, 4186 SLAB_ACCOUNT | prot->slab_flags, 4187 NULL); 4188 4189 if (!rsk_prot->slab) { 4190 pr_crit("%s: Can't create request sock SLAB cache!\n", 4191 prot->name); 4192 return -ENOMEM; 4193 } 4194 return 0; 4195 } 4196 4197 int 
proto_register(struct proto *prot, int alloc_slab) 4198 { 4199 int ret = -ENOBUFS; 4200 4201 if (prot->memory_allocated && !prot->sysctl_mem) { 4202 pr_err("%s: missing sysctl_mem\n", prot->name); 4203 return -EINVAL; 4204 } 4205 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 4206 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 4207 return -EINVAL; 4208 } 4209 if (alloc_slab) { 4210 struct kmem_cache_args args = { 4211 .useroffset = prot->useroffset, 4212 .usersize = prot->usersize, 4213 .freeptr_offset = prot->freeptr_offset, 4214 .use_freeptr_offset = !!prot->freeptr_offset, 4215 }; 4216 4217 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 4218 &args, 4219 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 4220 prot->slab_flags); 4221 if (prot->slab == NULL) { 4222 pr_crit("%s: Can't create sock SLAB cache!\n", 4223 prot->name); 4224 goto out; 4225 } 4226 4227 if (req_prot_init(prot)) 4228 goto out_free_request_sock_slab; 4229 4230 if (tw_prot_init(prot)) 4231 goto out_free_timewait_sock_slab; 4232 } 4233 4234 mutex_lock(&proto_list_mutex); 4235 ret = assign_proto_idx(prot); 4236 if (ret) { 4237 mutex_unlock(&proto_list_mutex); 4238 goto out_free_timewait_sock_slab; 4239 } 4240 list_add(&prot->node, &proto_list); 4241 mutex_unlock(&proto_list_mutex); 4242 return ret; 4243 4244 out_free_timewait_sock_slab: 4245 if (alloc_slab) 4246 tw_prot_cleanup(prot->twsk_prot); 4247 out_free_request_sock_slab: 4248 if (alloc_slab) { 4249 req_prot_cleanup(prot->rsk_prot); 4250 4251 kmem_cache_destroy(prot->slab); 4252 prot->slab = NULL; 4253 } 4254 out: 4255 return ret; 4256 } 4257 EXPORT_SYMBOL(proto_register); 4258 4259 void proto_unregister(struct proto *prot) 4260 { 4261 mutex_lock(&proto_list_mutex); 4262 release_proto_idx(prot); 4263 list_del(&prot->node); 4264 mutex_unlock(&proto_list_mutex); 4265 4266 kmem_cache_destroy(prot->slab); 4267 prot->slab = NULL; 4268 4269 req_prot_cleanup(prot->rsk_prot); 4270 tw_prot_cleanup(prot->twsk_prot); 4271 } 4272 EXPORT_SYMBOL(proto_unregister); 4273 4274 int sock_load_diag_module(int family, int protocol) 4275 { 4276 if (!protocol) { 4277 if (!sock_is_registered(family)) 4278 return -ENOENT; 4279 4280 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 4281 NETLINK_SOCK_DIAG, family); 4282 } 4283 4284 #ifdef CONFIG_INET 4285 if (family == AF_INET && 4286 protocol != IPPROTO_RAW && 4287 protocol < MAX_INET_PROTOS && 4288 !rcu_access_pointer(inet_protos[protocol])) 4289 return -ENOENT; 4290 #endif 4291 4292 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 4293 NETLINK_SOCK_DIAG, family, protocol); 4294 } 4295 EXPORT_SYMBOL(sock_load_diag_module); 4296 4297 #ifdef CONFIG_PROC_FS 4298 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4299 __acquires(proto_list_mutex) 4300 { 4301 mutex_lock(&proto_list_mutex); 4302 return seq_list_start_head(&proto_list, *pos); 4303 } 4304 4305 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4306 { 4307 return seq_list_next(v, &proto_list, pos); 4308 } 4309 4310 static void proto_seq_stop(struct seq_file *seq, void *v) 4311 __releases(proto_list_mutex) 4312 { 4313 mutex_unlock(&proto_list_mutex); 4314 } 4315 4316 static char proto_method_implemented(const void *method) 4317 { 4318 return method == NULL ? 'n' : 'y'; 4319 } 4320 static long sock_prot_memory_allocated(struct proto *proto) 4321 { 4322 return proto->memory_allocated != NULL ? 
proto_memory_allocated(proto) : -1L; 4323 } 4324 4325 static const char *sock_prot_memory_pressure(struct proto *proto) 4326 { 4327 return proto->memory_pressure != NULL ? 4328 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 4329 } 4330 4331 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4332 { 4333 4334 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4335 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4336 proto->name, 4337 proto->obj_size, 4338 sock_prot_inuse_get(seq_file_net(seq), proto), 4339 sock_prot_memory_allocated(proto), 4340 sock_prot_memory_pressure(proto), 4341 proto->max_header, 4342 proto->slab == NULL ? "no" : "yes", 4343 module_name(proto->owner), 4344 proto_method_implemented(proto->close), 4345 proto_method_implemented(proto->connect), 4346 proto_method_implemented(proto->disconnect), 4347 proto_method_implemented(proto->accept), 4348 proto_method_implemented(proto->ioctl), 4349 proto_method_implemented(proto->init), 4350 proto_method_implemented(proto->destroy), 4351 proto_method_implemented(proto->shutdown), 4352 proto_method_implemented(proto->setsockopt), 4353 proto_method_implemented(proto->getsockopt), 4354 proto_method_implemented(proto->sendmsg), 4355 proto_method_implemented(proto->recvmsg), 4356 proto_method_implemented(proto->bind), 4357 proto_method_implemented(proto->backlog_rcv), 4358 proto_method_implemented(proto->hash), 4359 proto_method_implemented(proto->unhash), 4360 proto_method_implemented(proto->get_port), 4361 proto_method_implemented(proto->enter_memory_pressure)); 4362 } 4363 4364 static int proto_seq_show(struct seq_file *seq, void *v) 4365 { 4366 if (v == &proto_list) 4367 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4368 "protocol", 4369 "size", 4370 "sockets", 4371 "memory", 4372 "press", 4373 "maxhdr", 4374 "slab", 4375 "module", 4376 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4377 else 4378 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4379 return 0; 4380 } 4381 4382 static const struct seq_operations proto_seq_ops = { 4383 .start = proto_seq_start, 4384 .next = proto_seq_next, 4385 .stop = proto_seq_stop, 4386 .show = proto_seq_show, 4387 }; 4388 4389 static __net_init int proto_init_net(struct net *net) 4390 { 4391 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4392 sizeof(struct seq_net_private))) 4393 return -ENOMEM; 4394 4395 return 0; 4396 } 4397 4398 static __net_exit void proto_exit_net(struct net *net) 4399 { 4400 remove_proc_entry("protocols", net->proc_net); 4401 } 4402 4403 4404 static __net_initdata struct pernet_operations proto_net_ops = { 4405 .init = proto_init_net, 4406 .exit = proto_exit_net, 4407 }; 4408 4409 static int __init proto_init(void) 4410 { 4411 return register_pernet_subsys(&proto_net_ops); 4412 } 4413 4414 subsys_initcall(proto_init); 4415 4416 #endif /* PROC_FS */ 4417 4418 #ifdef CONFIG_NET_RX_BUSY_POLL 4419 bool sk_busy_loop_end(void *p, unsigned long start_time) 4420 { 4421 struct sock *sk = p; 4422 4423 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4424 return true; 4425 4426 if (sk_is_udp(sk) && 4427 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4428 return true; 4429 4430 return sk_busy_loop_timeout(sk, start_time); 4431 } 4432 EXPORT_SYMBOL(sk_busy_loop_end); 4433 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4434 4435 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) 4436 { 4437 if (!sk->sk_prot->bind_add) 4438 return 
-EOPNOTSUPP; 4439 return sk->sk_prot->bind_add(sk, addr, addr_len); 4440 } 4441 EXPORT_SYMBOL(sock_bind_add); 4442 4443 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4444 int sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4445 void __user *arg, void *karg, size_t size) 4446 { 4447 int ret; 4448 4449 if (copy_from_user(karg, arg, size)) 4450 return -EFAULT; 4451 4452 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4453 if (ret) 4454 return ret; 4455 4456 if (copy_to_user(arg, karg, size)) 4457 return -EFAULT; 4458 4459 return 0; 4460 } 4461 EXPORT_SYMBOL(sock_ioctl_inout); 4462 4463 /* This is the most common ioctl prep function, where the result (4 bytes) is 4464 * copied back to userspace if the ioctl() returns successfully. No input is 4465 * copied from userspace as input argument. 4466 */ 4467 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4468 { 4469 int ret, karg = 0; 4470 4471 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4472 if (ret) 4473 return ret; 4474 4475 return put_user(karg, (int __user *)arg); 4476 } 4477 4478 /* A wrapper around sock ioctls, which copies the data from userspace 4479 * (depending on the protocol/ioctl), and copies back the result to userspace. 4480 * The main motivation for this function is to pass kernel memory to the 4481 * protocol ioctl callbacks, instead of userspace memory. 4482 */ 4483 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4484 { 4485 int rc = 1; 4486 4487 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4488 rc = ipmr_sk_ioctl(sk, cmd, arg); 4489 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4490 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4491 else if (sk_is_phonet(sk)) 4492 rc = phonet_sk_ioctl(sk, cmd, arg); 4493 4494 /* If ioctl was processed, returns its value */ 4495 if (rc <= 0) 4496 return rc; 4497 4498 /* Otherwise call the default handler */ 4499 return sock_ioctl_out(sk, cmd, arg); 4500 } 4501 EXPORT_SYMBOL(sk_ioctl); 4502 4503 static int __init sock_struct_check(void) 4504 { 4505 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); 4506 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); 4507 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); 4508 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); 4509 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); 4510 4511 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); 4512 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); 4513 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); 4514 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); 4515 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter); 4516 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); 4517 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); 4518 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); 4519 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); 4520 4521 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); 4522 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); 4523 #ifdef CONFIG_MEMCG 4524 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4525 #endif 4526 4527 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4528 CACHELINE_ASSERT_GROUP_MEMBER(struct 
sock, sock_write_rxtx, sk_reserved_mem); 4529 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); 4530 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); 4531 4532 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4533 4534 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); 4535 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); 4536 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); 4537 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); 4538 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); 4539 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); 4540 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); 4541 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); 4542 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); 4543 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); 4544 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); 4545 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey); 4546 4547 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); 4548 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); 4549 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); 4550 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 4551 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); 4552 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); 4553 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); 4554 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); 4555 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); 4556 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); 4557 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); 4558 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); 4559 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); 4560 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); 4561 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); 4562 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); 4563 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); 4564 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); 4565 return 0; 4566 } 4567 4568 core_initcall(sock_struct_check); 4569
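/*
 * Editor's note -- illustrative sketch, not part of sock.c: the minimal
 * proto_register()/proto_unregister() pattern a protocol module is expected
 * to follow. "foo_sock", "foo_prot" and the init/exit hooks are hypothetical
 * names used only to make the calling convention concrete.
 *
 *	struct foo_sock {
 *		struct sock sk;		// struct sock must be the first member
 *		// protocol-private state follows
 *	};
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *	};
 *
 *	static int __init foo_init(void)
 *	{
 *		// alloc_slab == 1: proto_register() creates a kmem_cache
 *		// sized from .obj_size and named after .name.
 *		return proto_register(&foo_prot, 1);
 *	}
 *
 *	static void __exit foo_exit(void)
 *	{
 *		proto_unregister(&foo_prot);
 *	}
 */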
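/*
 * Editor's note -- illustrative sketch, not part of sock.c: what a protocol's
 * ->ioctl() callback looks like under the sk_ioctl()/sock_ioctl_out() wrappers
 * above. The callback only touches kernel memory (*karg); the wrappers perform
 * all copies to and from userspace. "foo_ioctl" and its SIOCINQ handling are
 * hypothetical, shown only to illustrate the calling convention.
 *
 *	static int foo_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			// Report queued receive bytes into kernel memory only;
 *			// sock_ioctl_out() put_user()s it back to the caller.
 *			*karg = sk_rmem_alloc_get(sk);
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 *
 * The protocol publishes the callback in its struct proto (.ioctl = foo_ioctl),
 * which is how READ_ONCE(sk->sk_prot)->ioctl() above reaches it.
 */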