// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 */

/* Prefix every pr_*() message emitted from this file with the module name. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>
#include <linux/compat.h>
#include <linux/mroute.h>
#include <linux/mroute6.h>
#include <linux/icmpv6.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <linux/skbuff_ref.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <net/proto_memory.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <net/bpf_sk_storage.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>
#include <net/phonet/phonet.h>

#include <linux/ethtool.h>

#include <uapi/linux/pidfd.h>

#include "dev.h"

/* List of protocols, with the mutex that guards it (users are outside this chunk). */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations; the definitions are not part of this section. */
static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc);
static void sock_def_write_space(struct sock *sk);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in the user namespace @user_ns.
 *
 * Return: true only when both file_ns_capable() on the socket's file and
 * ns_capable() for the current task succeed.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	/* NOTE(review): sk->sk_socket is dereferenced without a NULL check;
	 * presumably callers only pass sockets attached to a struct socket.
	 */
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap in all user namespaces.
 *
 * Return: result of sk_ns_capable() evaluated against &init_user_ns.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and if the current process has the capability
 * @cap over the network namespace the socket is a member of.
 *
 * Return: result of sk_ns_capable() evaluated against the user namespace
 * of sock_net(@sk).
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

/*
 * _sock_locks(x) expands to a comma-separated list of string literals,
 * each one the prefix @x pasted in front of an address family name, ending
 * with an "AF_MAX" entry. It is used to initialise the AF_MAX+1 sized
 * tables below.
 * NOTE(review): the position of each name presumably equals the numeric
 * AF_* value, so a new family must be inserted at its own index - confirm
 * against the AF_* definitions.
 */
#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"	,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"	,	x "AF_XDP"	, \
  x "AF_MCTP"  , \
  x "AF_MAX"

/* Per-family lock class names: one table per lock kind and socket origin. */
static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

/* Same tables with a "k-" prefix; the neighbouring key arrays pair "kern" with internal sockets. */
static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters. */
/* The two *_max values are initialised to 4 << 20 (4 MiB). */
__u32 sysctl_wmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = 4 << 20;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT;

/* Static key incremented/decremented by sk_set_memalloc()/sk_clear_memalloc(). */
DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

/*
 * Undo sk_set_memalloc(): clear SOCK_MEMALLOC and __GFP_MEMALLOC on @sk,
 * decrement memalloc_socks_key and reclaim memory with sk_mem_reclaim().
 */
void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

/*
 * Run the socket's sk_backlog_rcv handler for @skb between
 * memalloc_noreclaim_save() and memalloc_noreclaim_restore().
 *
 * BUGs if @sk does not have SOCK_MEMALLOC set. Returns whatever the
 * handler returns.
 */
int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
				 tcp_v6_do_rcv,
				 tcp_v4_do_rcv,
				 sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

/*
 * Invoke the socket's sk_error_report callback, then fire the
 * inet_sk_error_report tracepoint for AF_INET and AF_INET6 sockets.
 */
void sk_error_report(struct sock *sk)
{
	sk->sk_error_report(sk);

	switch (sk->sk_family) {
	case AF_INET:
		fallthrough;
	case AF_INET6:
		trace_inet_sk_error_report(sk);
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(sk_error_report);

/*
 * Convert the jiffies timeout @timeo into a timeval stored at @optval.
 *
 * MAX_SCHEDULE_TIMEOUT is reported as 0 seconds / 0 microseconds.
 * The layout written depends on @old_timeval:
 *   - old_timeval32 when @old_timeval is set in a compat syscall without
 *     64-bit time,
 *   - __kernel_old_timeval when @old_timeval is set otherwise,
 *   - __kernel_sock_timeval when @old_timeval is false.
 *
 * @optval is written without any size check here, so the caller must
 * provide a buffer large enough for the selected layout.
 *
 * Returns the number of bytes written.
 */
int sock_get_timeout(long timeo, void *optval, bool old_timeval)
{
	struct __kernel_sock_timeval tv;

	if (timeo == MAX_SCHEDULE_TIMEOUT) {
		tv.tv_sec = 0;
		tv.tv_usec = 0;
	} else {
		tv.tv_sec = timeo / HZ;
		tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
	}

	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
		*(struct old_timeval32 *)optval = tv32;
		return sizeof(tv32);
	}

	if (old_timeval) {
		struct __kernel_old_timeval old_tv;
		old_tv.tv_sec = tv.tv_sec;
		old_tv.tv_usec = tv.tv_usec;
		*(struct __kernel_old_timeval *)optval = old_tv;
		return sizeof(old_tv);
	}

	*(struct __kernel_sock_timeval *)optval = tv;
	return sizeof(tv);
}
EXPORT_SYMBOL(sock_get_timeout);

/*
 * Copy a timeval from @optval into @tv, selecting the source layout with
 * the same rules as sock_get_timeout().
 *
 * Returns 0 on success, -EINVAL when @optlen is smaller than the selected
 * layout and -EFAULT when copy_from_sockptr() fails. The values are not
 * range checked here.
 */
int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
			   sockptr_t optval, int optlen, bool old_timeval)
{
	if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
		struct old_timeval32 tv32;

		if (optlen < sizeof(tv32))
			return -EINVAL;

		if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
			return -EFAULT;
		tv->tv_sec = tv32.tv_sec;
		tv->tv_usec = tv32.tv_usec;
	} else if (old_timeval) {
		struct __kernel_old_timeval old_tv;

		if (optlen < sizeof(old_tv))
			return -EINVAL;
		if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
			return -EFAULT;
		tv->tv_sec = old_tv.tv_sec;
		tv->tv_usec = old_tv.tv_usec;
	} else {
		if (optlen < sizeof(*tv))
			return -EINVAL;
		if (copy_from_sockptr(tv, optval, sizeof(*tv)))
			return -EFAULT;
	}

	return 0;
}
EXPORT_SYMBOL(sock_copy_user_timeval);

/*
 * Read a timeval from @optval and store it, converted to jiffies, in
 * *@timeo_p.
 *
 * Returns the error from sock_copy_user_timeval(), or -EDOM when tv_usec
 * is outside [0, USEC_PER_SEC). A negative tv_sec stores a timeout of 0,
 * logs a notice (limited by the local counter and net_ratelimit()) and
 * returns 0. An all-zero timeval, or a tv_sec at or above
 * MAX_SCHEDULE_TIMEOUT / HZ - 1, stores MAX_SCHEDULE_TIMEOUT.
 */
static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
			    bool old_timeval)
{
	struct __kernel_sock_timeval tv;
	int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
	long val;

	if (err)
		return err;

	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		WRITE_ONCE(*timeo_p, 0);
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	val = MAX_SCHEDULE_TIMEOUT;
	/* Microseconds are rounded up to whole jiffies. */
	if ((tv.tv_sec || tv.tv_usec) &&
	    (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
		val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
						    USEC_PER_SEC / HZ);
	WRITE_ONCE(*timeo_p, val);
	return 0;
}

/*
 * Priorities from TC_PRIO_BESTEFFORT to TC_PRIO_INTERACTIVE are always
 * allowed; any other value requires CAP_NET_RAW or CAP_NET_ADMIN in the
 * user namespace of the socket's network namespace.
 */
static bool sk_set_prio_allowed(const struct sock *sk, int val)
{
	return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
		sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN));
}

/* Returns false for AF_UNSPEC and AF_UNIX sockets, true for every other family. */
static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

/*
 * Clear the @flags bits in sk->sk_flags. When at least one of them was
 * set, the family needs netstamp and no SK_FLAGS_TIMESTAMP bit remains,
 * call net_disable_timestamp().
 */
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


/*
 * Append @skb to sk->sk_receive_queue.
 *
 * Returns 0 on success, -ENOMEM when sk_rmem_alloc has reached sk_rcvbuf
 * and -ENOBUFS when sk_rmem_schedule() fails. Both failures increment the
 * socket drop counter; @skb is not freed by this function.
 * On success the sk_data_ready callback is invoked unless the socket is
 * flagged SOCK_DEAD.
 */
int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) {
		sk_drops_inc(sk);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		sk_drops_inc(sk);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

/*
 * Run the socket filter on @skb and, if it passes, queue it with
 * __sock_queue_rcv_skb().
 *
 * Returns the filter's drop reason, SKB_DROP_REASON_SOCKET_RCVBUFF for
 * -ENOMEM, SKB_DROP_REASON_PROTO_MEM for -ENOBUFS, or SKB_NOT_DROPPED_YET
 * otherwise. @skb is not freed here.
 */
enum skb_drop_reason
sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb)
{
	enum skb_drop_reason drop_reason;
	int err;

	drop_reason = sk_filter_reason(sk, skb);
	if (drop_reason)
		return drop_reason;

	err = __sock_queue_rcv_skb(sk, skb);
	switch (err) {
	case -ENOMEM:
		return SKB_DROP_REASON_SOCKET_RCVBUFF;
	case -ENOBUFS:
		return SKB_DROP_REASON_PROTO_MEM;
	}
	return SKB_NOT_DROPPED_YET;
}
EXPORT_SYMBOL(sock_queue_rcv_skb_reason);

/*
 * Filter @skb and deliver it to @sk: processed directly through
 * sk_backlog_rcv() when the socket is not owned by a user context,
 * otherwise added to the backlog.
 *
 * @nested:     take the socket spinlock with bh_lock_sock_nested()
 *              instead of bh_lock_sock()
 * @trim_cap:   passed through to sk_filter_trim_cap()
 * @refcounted: when true, sock_put(@sk) is called before returning
 *
 * Returns the value of sk_backlog_rcv() when it is called, NET_RX_SUCCESS
 * otherwise - including the paths where @skb is dropped and freed with
 * sk_skb_reason_drop().
 */
int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	enum skb_drop_reason reason;
	int rc = NET_RX_SUCCESS;
	int err;

	reason = sk_filter_trim_cap(sk, skb, trim_cap);
	if (reason)
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) {
		sk_drops_inc(sk);
		reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
	} else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) {
		/* Backlog add failed: unlock here, the common unlock below is skipped. */
		bh_unlock_sock(sk);
		if (err == -ENOMEM)
			reason = SKB_DROP_REASON_PFMEMALLOC;
		if (err == -ENOBUFS)
			reason = SKB_DROP_REASON_SOCKET_BACKLOG;
		sk_drops_inc(sk);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	sk_skb_reason_drop(sk, skb, reason);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
							  u32));
INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
							   u32));
/*
 * Return the socket's cached dst obtained with __sk_dst_get(), or NULL.
 *
 * When the dst is marked obsolete and its ->check() operation returns
 * NULL for @cookie, the tx queue mapping, sk_dst_pending_confirm and
 * sk_dst_cache are cleared, the dst is released and NULL is returned.
 */
struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && READ_ONCE(dst->obsolete) &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

/*
 * Same check as __sk_dst_check(), but the dst is obtained with
 * sk_dst_get() and an invalid entry is dropped with sk_dst_reset().
 */
struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && READ_ONCE(dst->obsolete) &&
	    INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
			       dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

/*
 * Bind @sk to interface index @ifindex. Both callers in this file invoke
 * it between a lock and a release of the socket (the locking is optional
 * in sock_bindtoindex()).
 *
 * Returns 0 on success, -ENOPROTOOPT without CONFIG_NETDEVICES, -EPERM
 * when the socket is already bound and the caller lacks CAP_NET_RAW in
 * the netns user namespace, and -EINVAL for a negative @ifindex.
 * On success the protocol's rehash hook (if any) is called and the cached
 * dst is reset.
 */
static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry... */
	ret = -EPERM;
	if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (ifindex < 0)
		goto out;

	/* Paired with all READ_ONCE() done locklessly. */
	WRITE_ONCE(sk->sk_bound_dev_if, ifindex);

	if (sk->sk_prot->rehash)
		sk->sk_prot->rehash(sk);
	sk_dst_reset(sk);

	ret = 0;

out:
#endif

	return ret;
}

/*
 * Bind @sk to @ifindex, taking the socket lock around the operation only
 * when @lock_sk is true. Returns the result of sock_bindtoindex_locked().
 */
int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
{
	int ret;

	if (lock_sk)
		lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, ifindex);
	if (lock_sk)
		release_sock(sk);

	return ret;
}
EXPORT_SYMBOL(sock_bindtoindex);

/*
 * SO_BINDTODEVICE setter: bind @sk to the interface named in @optval.
 *
 * Returns -ENOPROTOOPT without CONFIG_NETDEVICES, -EINVAL for a negative
 * @optlen, -EFAULT when the name cannot be copied, -ENODEV when no device
 * with that name exists in the socket's netns, otherwise the result of
 * sock_bindtoindex_locked().
 */
static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	/* Longer names are truncated; the memset keeps devname NUL terminated. */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_sockptr(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		/* Only the ifindex is kept past the RCU section; dev is not dereferenced afterwards. */
		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	sockopt_lock_sock(sk);
	ret = sock_bindtoindex_locked(sk, index);
	sockopt_release_sock(sk);
out:
#endif

	return ret;
}

/*
 * SO_BINDTODEVICE getter: copy the name of the bound interface to
 * @optval and its length, including the terminating NUL, to @optlen.
 *
 * An unbound socket reports a length of 0 and writes nothing to @optval.
 * Returns -ENOPROTOOPT without CONFIG_NETDEVICES, -EINVAL when @len is
 * smaller than IFNAMSIZ, the error from netdev_get_name(), -EFAULT when a
 * copy to the caller fails, and 0 on success.
 */
static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
				sockptr_t optlen, int len)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];

	if (bound_dev_if == 0) {
		len = 0;
		goto zero;
	}

	ret = -EINVAL;
	if (len < IFNAMSIZ)
		goto out;

	ret = netdev_get_name(net, devname, bound_dev_if);
	if (ret)
		goto out;

	len = strlen(devname) + 1;

	ret = -EFAULT;
	if (copy_to_sockptr(optval, devname, len))
		goto out;

zero:
	ret = -EFAULT;
	if (copy_to_sockptr(optlen, &len, sizeof(int)))
		goto out;

	ret = 0;

out:
#endif

	return ret;
}

/*
 * Report whether multicast loopback applies for @sk.
 *
 * Returns false whenever dev_recursion_level() is non-zero, true for a
 * NULL @sk, and otherwise the MC_LOOP (AF_INET) or MC6_LOOP (AF_INET6)
 * bit of the socket. Any other family triggers a one-time warning and
 * returns true.
 */
bool sk_mc_loop(const struct sock *sk)
{
	if (dev_recursion_level())
		return false;
	if (!sk)
		return true;
	/* IPV6_ADDRFORM can change sk->sk_family under us. */
	switch (READ_ONCE(sk->sk_family)) {
	case AF_INET:
		return inet_test_bit(MC_LOOP, sk);
#if IS_ENABLED(CONFIG_IPV6)
	case AF_INET6:
		return inet6_test_bit(MC6_LOOP, sk);
#endif
	}
	WARN_ON_ONCE(1);
	return true;
}
EXPORT_SYMBOL(sk_mc_loop);

/* Set sk->sk_reuse to SK_CAN_REUSE under the socket lock. */
void sock_set_reuseaddr(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuse = SK_CAN_REUSE;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseaddr);

/* Set sk->sk_reuseport to true under the socket lock. */
void sock_set_reuseport(struct sock *sk)
{
	lock_sock(sk);
	sk->sk_reuseport = true;
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_reuseport);

/* Set SOCK_LINGER with a linger time of 0, under the socket lock. */
void sock_no_linger(struct sock *sk)
{
	lock_sock(sk);
	WRITE_ONCE(sk->sk_lingertime, 0);
	sock_set_flag(sk, SOCK_LINGER);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_no_linger);

/* Store @priority in sk->sk_priority; no permission check is done here. */
void sock_set_priority(struct sock *sk, u32 priority)
{
	WRITE_ONCE(sk->sk_priority, priority);
}
EXPORT_SYMBOL(sock_set_priority);

/*
 * Set the send timeout of @sk to @secs seconds. A value of 0, or one at
 * or above MAX_SCHEDULE_TIMEOUT / HZ - 1, stores MAX_SCHEDULE_TIMEOUT.
 * NOTE(review): a negative @secs passes the range test and is stored as
 * secs * HZ - confirm callers never pass one.
 */
void sock_set_sndtimeo(struct sock *sk, s64 secs)
{
	if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
		WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
	else
		WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
}
EXPORT_SYMBOL(sock_set_sndtimeo);

/*
 * Update the receive timestamp flags of @sk.
 *
 * @val: value for SOCK_RCVTSTAMP
 * @new: value for SOCK_TSTAMP_NEW (only applied when @val is true)
 * @ns:  SOCK_RCVTSTAMPNS is set to (@val && @ns)
 *
 * When @val is true SOCK_TIMESTAMP is also enabled. Nothing is disabled
 * through sock_disable_timestamp() when @val is false.
 */
static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
{
	sock_valbool_flag(sk, SOCK_RCVTSTAMP, val);
	sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns);
	if (val) {
		sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	}
}

/*
 * Apply one of the SO_TIMESTAMP{,NS}_{OLD,NEW} options: the _NEW variants
 * select SOCK_TSTAMP_NEW, the NS variants select SOCK_RCVTSTAMPNS.
 * Any other @optname is silently ignored.
 */
void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
{
	switch (optname) {
	case SO_TIMESTAMP_OLD:
		__sock_set_timestamps(sk, valbool, false, false);
		break;
	case SO_TIMESTAMP_NEW:
		__sock_set_timestamps(sk, valbool, true, false);
		break;
	case SO_TIMESTAMPNS_OLD:
		__sock_set_timestamps(sk, valbool, false, true);
		break;
	case SO_TIMESTAMPNS_NEW:
		__sock_set_timestamps(sk, valbool, true, true);
		break;
	}
}

/*
 * Record @phc_index in sk->sk_bind_phc after checking that it is one of
 * the vclock indexes reported by ethtool_get_phc_vclocks() for the device
 * the socket is bound to.
 *
 * Returns 0 on success, -EOPNOTSUPP when the socket is not bound to a
 * device (or the device lookup fails) and -EINVAL when @phc_index is not
 * in the reported list.
 */
static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
{
	struct net *net = sock_net(sk);
	struct net_device *dev = NULL;
	bool match = false;
	int *vclock_index;
	int i, num;

	if (sk->sk_bound_dev_if)
		dev = dev_get_by_index(net, sk->sk_bound_dev_if);

	if (!dev) {
		pr_err("%s: sock not bind to device\n", __func__);
		return -EOPNOTSUPP;
	}

	num = ethtool_get_phc_vclocks(dev, &vclock_index);
	dev_put(dev);

	for (i = 0; i < num; i++) {
		if (*(vclock_index + i) == phc_index) {
			match = true;
			break;
		}
	}

	/* vclock_index is only read and freed when num is positive. */
	if (num > 0)
		kfree(vclock_index);

	if (!match)
		return -EINVAL;

	WRITE_ONCE(sk->sk_bind_phc, phc_index);

	return 0;
}

/*
 * Validate and apply SO_TIMESTAMPING flags.
 *
 * Returns -EINVAL when:
 *   - a bit outside SOF_TIMESTAMPING_MASK is set,
 *   - OPT_ID_TCP is requested without OPT_ID,
 *   - OPT_ID is newly enabled on a TCP socket in CLOSE or LISTEN state,
 *   - OPT_STATS is requested without OPT_TSONLY.
 * With SOF_TIMESTAMPING_BIND_PHC the error of sock_timestamping_bind_phc()
 * is returned. Returns 0 on success.
 *
 * NOTE(review): sk_tskey is initialised before the later checks run, so a
 * request that is rejected afterwards has already reset the key.
 */
int sock_set_timestamping(struct sock *sk, int optname,
			  struct so_timestamping timestamping)
{
	int val = timestamping.flags;
	int ret;

	if (val & ~SOF_TIMESTAMPING_MASK)
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
	    !(val & SOF_TIMESTAMPING_OPT_ID))
		return -EINVAL;

	/* OPT_ID is being switched on: pick the starting value of sk_tskey. */
	if (val & SOF_TIMESTAMPING_OPT_ID &&
	    !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
		if (sk_is_tcp(sk)) {
			if ((1 << sk->sk_state) &
			    (TCPF_CLOSE | TCPF_LISTEN))
				return -EINVAL;
			if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
			else
				atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
		} else {
			atomic_set(&sk->sk_tskey, 0);
		}
	}

	if (val & SOF_TIMESTAMPING_OPT_STATS &&
	    !(val & SOF_TIMESTAMPING_OPT_TSONLY))
		return -EINVAL;

	if (val & SOF_TIMESTAMPING_BIND_PHC) {
		ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
		if (ret)
			return ret;
	}

	WRITE_ONCE(sk->sk_tsflags, val);
	sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
	sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY));

	if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
		sock_enable_timestamp(sk,
				      SOCK_TIMESTAMPING_RX_SOFTWARE);
	else
		sock_disable_timestamp(sk,
				       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
	return 0;
}

#if defined(CONFIG_CGROUP_BPF)
/*
 * Run the cgroup sock_ops BPF programs for @sk with operation @op and
 * @skb attached. Only the part of bpf_sock_ops_kern that precedes the
 * 'temp' member is zeroed before use.
 */
void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op)
{
	struct bpf_sock_ops_kern sock_ops;

	memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp));
	sock_ops.op = op;
	sock_ops.is_fullsock = 1;
	sock_ops.sk = sk;
	bpf_skops_init_skb(&sock_ops, skb, 0);
	__cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS);
}
#endif

/*
 * Enable keepalive on @sk under the socket lock: call the protocol's
 * keepalive hook when it has one, then set SOCK_KEEPOPEN.
 */
void sock_set_keepalive(struct sock *sk)
{
	lock_sock(sk);
	if (sk->sk_prot->keepalive)
		sk->sk_prot->keepalive(sk, true);
	sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_keepalive);

/*
 * Set the receive buffer size of @sk to twice @val (at least
 * SOCK_MIN_RCVBUF) and mark it user-locked with SOCK_RCVBUF_LOCK.
 * The one caller visible in this section holds the socket lock.
 */
static void __sock_set_rcvbuf(struct sock *sk, int val)
{
	/* Ensure val * 2 fits into an int, to prevent max_t() from treating it
	 * as a negative value.
	 */
	val = min_t(int, val, INT_MAX / 2);
	sk->sk_userlocks |= SOCK_RCVBUF_LOCK;

	/* We double it on the way in to account for "struct sk_buff" etc.
	 * overhead.   Applications assume that the SO_RCVBUF setting they make
	 * will allow that much actual data to be received on that socket.
	 *
	 * Applications are unaware that "struct sk_buff" and other overheads
	 * allocate from the receive buffer during socket buffer allocation.
	 *
	 * And after considering the possible alternatives, returning the value
	 * we actually used in getsockopt is the most desirable behavior.
	 */
	WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
}

/* Locked wrapper around __sock_set_rcvbuf(). */
void sock_set_rcvbuf(struct sock *sk, int val)
{
	lock_sock(sk);
	__sock_set_rcvbuf(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_rcvbuf);

/* Update sk->sk_mark and reset the cached dst, but only when the mark changes. */
static void __sock_set_mark(struct sock *sk, u32 val)
{
	if (val != sk->sk_mark) {
		WRITE_ONCE(sk->sk_mark, val);
		sk_dst_reset(sk);
	}
}

/* Locked wrapper around __sock_set_mark(). */
void sock_set_mark(struct sock *sk, u32 val)
{
	lock_sock(sk);
	__sock_set_mark(sk, val);
	release_sock(sk);
}
EXPORT_SYMBOL(sock_set_mark);

/*
 * Reduce sk->sk_reserved_mem by @bytes rounded down to a multiple of
 * PAGE_SIZE, then reclaim memory with sk_mem_reclaim(). Warns when the
 * rounded amount exceeds the current reservation; the subtraction is
 * still performed in that case.
 */
static void sock_release_reserved_memory(struct sock *sk, int bytes)
{
	/* Round down bytes to multiple of pages */
	bytes = round_down(bytes, PAGE_SIZE);

	WARN_ON(bytes > sk->sk_reserved_mem);
	WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
	sk_mem_reclaim(sk);
}

/*
 * Reserve @bytes (converted to whole pages by sk_mem_pages()) for @sk by
 * pre-charging the memory cgroup and, unless sk_bypass_prot_mem is set,
 * the protocol memory accounting. The reserved pages are added to the
 * socket's forward allocation and to sk_reserved_mem.
 *
 * Returns 0 on success (including @bytes == 0), -EOPNOTSUPP when memcg
 * socket accounting is not enabled for @sk or the protocol has no memory
 * accounting, and -ENOMEM when the memcg charge fails or the precharge
 * would push the protocol above sk_prot_mem_limits(sk, 1).
 */
static int sock_reserve_memory(struct sock *sk, int bytes)
{
	long allocated;
	bool charged;
	int pages;

	if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk))
		return -EOPNOTSUPP;

	if (!bytes)
		return 0;

	pages = sk_mem_pages(bytes);

	/* pre-charge to memcg */
	charged = mem_cgroup_sk_charge(sk, pages,
				       GFP_KERNEL | __GFP_RETRY_MAYFAIL);
	if (!charged)
		return -ENOMEM;

	if (sk->sk_bypass_prot_mem)
		goto success;

	/* pre-charge to forward_alloc */
	sk_memory_allocated_add(sk, pages);
	allocated = sk_memory_allocated(sk);

	/* If the system goes into memory pressure with this
	 * precharge, give up and return error.
	 */
	if (allocated > sk_prot_mem_limits(sk, 1)) {
		sk_memory_allocated_sub(sk, pages);
		mem_cgroup_sk_uncharge(sk, pages);
		return -ENOMEM;
	}

success:
	sk_forward_alloc_add(sk, pages << PAGE_SHIFT);

	WRITE_ONCE(sk->sk_reserved_mem,
		   sk->sk_reserved_mem + (pages << PAGE_SHIFT));

	return 0;
}

#ifdef CONFIG_PAGE_POOL

/* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED
 * in 1 syscall. The limit exists to limit the amount of memory the kernel
 * allocates to copy these tokens, and to prevent looping over the frags for
 * too long.
 */
#define MAX_DONTNEED_TOKENS 128
#define MAX_DONTNEED_FRAGS 1024

/*
 * SO_DEVMEM_DONTNEED: release the frags named by the dmabuf_token array
 * in @optval from sk->sk_user_frags.
 *
 * Returns the number of frags erased from the xarray, or -EBADF for a
 * non-TCP socket, -EINVAL when @optlen is not a multiple of the token
 * size or exceeds MAX_DONTNEED_TOKENS tokens, -ENOMEM when the token copy
 * cannot be allocated and -EFAULT when it cannot be read.
 * Processing stops silently once more than MAX_DONTNEED_FRAGS frags have
 * been visited; the count reached so far is still returned.
 */
static noinline_for_stack int
sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen)
{
	unsigned int num_tokens, i, j, k, netmem_num = 0;
	struct dmabuf_token *tokens;
	int ret = 0, num_frags = 0;
	netmem_ref netmems[16];

	if (!sk_is_tcp(sk))
		return -EBADF;

	if (optlen % sizeof(*tokens) ||
	    optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS)
		return -EINVAL;

	num_tokens = optlen / sizeof(*tokens);
	tokens = kvmalloc_objs(*tokens, num_tokens);
	if (!tokens)
		return -ENOMEM;

	if (copy_from_sockptr(tokens, optval, optlen)) {
		kvfree(tokens);
		return -EFAULT;
	}

	xa_lock_bh(&sk->sk_user_frags);
	for (i = 0; i < num_tokens; i++) {
		for (j = 0; j < tokens[i].token_count; j++) {
			if (++num_frags > MAX_DONTNEED_FRAGS)
				goto frag_limit_reached;

			netmem_ref netmem = (__force netmem_ref)__xa_erase(
				&sk->sk_user_frags, tokens[i].token_start + j);

			if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem)))
				continue;

			/* Erased entries are batched so the pages are put
			 * with the xarray lock dropped.
			 */
			netmems[netmem_num++] = netmem;
			if (netmem_num == ARRAY_SIZE(netmems)) {
				xa_unlock_bh(&sk->sk_user_frags);
				for (k = 0; k < netmem_num; k++)
					WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));
				netmem_num = 0;
				xa_lock_bh(&sk->sk_user_frags);
			}
			ret++;
		}
	}

frag_limit_reached:
	xa_unlock_bh(&sk->sk_user_frags);
	/* Put whatever is left in the final, partially filled batch. */
	for (k = 0; k < netmem_num; k++)
		WARN_ON_ONCE(!napi_pp_put_page(netmems[k]));

	kvfree(tokens);
	return ret;
}
#endif

/* Take the socket lock for a sockopt handler, unless called from a BPF program. */
void sockopt_lock_sock(struct sock *sk)
{
	/* When current->bpf_ctx is set, the setsockopt is called from
	 * a bpf prog.  bpf has ensured the sk lock has been
	 * acquired before calling setsockopt().
	 */
	if (has_current_bpf_ctx())
		return;

	lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);

/* Counterpart of sockopt_lock_sock(): a no-op when a BPF context is current. */
void sockopt_release_sock(struct sock *sk)
{
	if (has_current_bpf_ctx())
		return;

	release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);

/* ns_capable() check that is always satisfied when a BPF context is current. */
bool sockopt_ns_capable(struct user_namespace *ns, int cap)
{
	return has_current_bpf_ctx() || ns_capable(ns, cap);
}
EXPORT_SYMBOL(sockopt_ns_capable);

/* capable() check that is always satisfied when a BPF context is current. */
bool sockopt_capable(int cap)
{
	return has_current_bpf_ctx() || capable(cap);
}
EXPORT_SYMBOL(sockopt_capable);

/*
 * Returns 0 for CLOCK_REALTIME, CLOCK_MONOTONIC and CLOCK_TAI, -EINVAL
 * for any other clock id.
 */
static int sockopt_validate_clockid(__kernel_clockid_t value)
{
	switch (value) {
	case CLOCK_REALTIME:
	case CLOCK_MONOTONIC:
	case CLOCK_TAI:
		return 0;
	}
	return -EINVAL;
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
1184 */ 1185 1186 int sk_setsockopt(struct sock *sk, int level, int optname, 1187 sockptr_t optval, unsigned int optlen) 1188 { 1189 struct so_timestamping timestamping; 1190 struct socket *sock = sk->sk_socket; 1191 struct sock_txtime sk_txtime; 1192 int val; 1193 int valbool; 1194 struct linger ling; 1195 int ret = 0; 1196 1197 /* 1198 * Options without arguments 1199 */ 1200 1201 if (optname == SO_BINDTODEVICE) 1202 return sock_setbindtodevice(sk, optval, optlen); 1203 1204 if (optlen < sizeof(int)) 1205 return -EINVAL; 1206 1207 if (copy_from_sockptr(&val, optval, sizeof(val))) 1208 return -EFAULT; 1209 1210 valbool = val ? 1 : 0; 1211 1212 /* handle options which do not require locking the socket. */ 1213 switch (optname) { 1214 case SO_PRIORITY: 1215 if (sk_set_prio_allowed(sk, val)) { 1216 sock_set_priority(sk, val); 1217 return 0; 1218 } 1219 return -EPERM; 1220 case SO_TYPE: 1221 case SO_PROTOCOL: 1222 case SO_DOMAIN: 1223 case SO_ERROR: 1224 return -ENOPROTOOPT; 1225 #ifdef CONFIG_NET_RX_BUSY_POLL 1226 case SO_BUSY_POLL: 1227 if (val < 0) 1228 return -EINVAL; 1229 WRITE_ONCE(sk->sk_ll_usec, val); 1230 return 0; 1231 case SO_PREFER_BUSY_POLL: 1232 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1233 return -EPERM; 1234 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1235 return 0; 1236 case SO_BUSY_POLL_BUDGET: 1237 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1238 !sockopt_capable(CAP_NET_ADMIN)) 1239 return -EPERM; 1240 if (val < 0 || val > U16_MAX) 1241 return -EINVAL; 1242 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1243 return 0; 1244 #endif 1245 case SO_MAX_PACING_RATE: 1246 { 1247 unsigned long ulval = (val == ~0U) ? 
~0UL : (unsigned int)val; 1248 unsigned long pacing_rate; 1249 1250 if (sizeof(ulval) != sizeof(val) && 1251 optlen >= sizeof(ulval) && 1252 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1253 return -EFAULT; 1254 } 1255 if (ulval != ~0UL) 1256 cmpxchg(&sk->sk_pacing_status, 1257 SK_PACING_NONE, 1258 SK_PACING_NEEDED); 1259 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1260 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1261 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1262 if (ulval < pacing_rate) 1263 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1264 return 0; 1265 } 1266 case SO_TXREHASH: 1267 if (!sk_is_tcp(sk)) 1268 return -EOPNOTSUPP; 1269 if (val < -1 || val > 1) 1270 return -EINVAL; 1271 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1272 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1273 /* Paired with READ_ONCE() in tcp_rtx_synack() 1274 * and sk_getsockopt(). 1275 */ 1276 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1277 return 0; 1278 case SO_PEEK_OFF: 1279 { 1280 int (*set_peek_off)(struct sock *sk, int val); 1281 1282 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1283 if (set_peek_off) 1284 ret = set_peek_off(sk, val); 1285 else 1286 ret = -EOPNOTSUPP; 1287 return ret; 1288 } 1289 #ifdef CONFIG_PAGE_POOL 1290 case SO_DEVMEM_DONTNEED: 1291 return sock_devmem_dontneed(sk, optval, optlen); 1292 #endif 1293 case SO_SNDTIMEO_OLD: 1294 case SO_SNDTIMEO_NEW: 1295 return sock_set_timeout(&sk->sk_sndtimeo, optval, 1296 optlen, optname == SO_SNDTIMEO_OLD); 1297 case SO_RCVTIMEO_OLD: 1298 case SO_RCVTIMEO_NEW: 1299 return sock_set_timeout(&sk->sk_rcvtimeo, optval, 1300 optlen, optname == SO_RCVTIMEO_OLD); 1301 } 1302 1303 sockopt_lock_sock(sk); 1304 1305 switch (optname) { 1306 case SO_DEBUG: 1307 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1308 ret = -EACCES; 1309 else 1310 sock_valbool_flag(sk, SOCK_DBG, valbool); 1311 break; 1312 case SO_REUSEADDR: 1313 sk->sk_reuse = (valbool ? 
SK_CAN_REUSE : SK_NO_REUSE); 1314 break; 1315 case SO_REUSEPORT: 1316 if (valbool && !sk_is_inet(sk)) 1317 ret = -EOPNOTSUPP; 1318 else 1319 sk->sk_reuseport = valbool; 1320 break; 1321 case SO_DONTROUTE: 1322 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1323 sk_dst_reset(sk); 1324 break; 1325 case SO_BROADCAST: 1326 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1327 break; 1328 case SO_SNDBUF: 1329 /* Don't error on this BSD doesn't and if you think 1330 * about it this is right. Otherwise apps have to 1331 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1332 * are treated in BSD as hints 1333 */ 1334 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1335 set_sndbuf: 1336 /* Ensure val * 2 fits into an int, to prevent max_t() 1337 * from treating it as a negative value. 1338 */ 1339 val = min_t(int, val, INT_MAX / 2); 1340 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1341 WRITE_ONCE(sk->sk_sndbuf, 1342 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1343 /* Wake up sending tasks if we upped the value. */ 1344 sk->sk_write_space(sk); 1345 break; 1346 1347 case SO_SNDBUFFORCE: 1348 if (!sockopt_capable(CAP_NET_ADMIN)) { 1349 ret = -EPERM; 1350 break; 1351 } 1352 1353 /* No negative values (to prevent underflow, as val will be 1354 * multiplied by 2). 1355 */ 1356 if (val < 0) 1357 val = 0; 1358 goto set_sndbuf; 1359 1360 case SO_RCVBUF: 1361 /* Don't error on this BSD doesn't and if you think 1362 * about it this is right. Otherwise apps have to 1363 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1364 * are treated in BSD as hints 1365 */ 1366 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1367 break; 1368 1369 case SO_RCVBUFFORCE: 1370 if (!sockopt_capable(CAP_NET_ADMIN)) { 1371 ret = -EPERM; 1372 break; 1373 } 1374 1375 /* No negative values (to prevent underflow, as val will be 1376 * multiplied by 2). 
1377 */ 1378 __sock_set_rcvbuf(sk, max(val, 0)); 1379 break; 1380 1381 case SO_KEEPALIVE: 1382 if (sk->sk_prot->keepalive) 1383 sk->sk_prot->keepalive(sk, valbool); 1384 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1385 break; 1386 1387 case SO_OOBINLINE: 1388 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1389 break; 1390 1391 case SO_NO_CHECK: 1392 sk->sk_no_check_tx = valbool; 1393 break; 1394 1395 case SO_LINGER: 1396 if (optlen < sizeof(ling)) { 1397 ret = -EINVAL; /* 1003.1g */ 1398 break; 1399 } 1400 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1401 ret = -EFAULT; 1402 break; 1403 } 1404 if (!ling.l_onoff) { 1405 sock_reset_flag(sk, SOCK_LINGER); 1406 } else { 1407 unsigned long t_sec = ling.l_linger; 1408 1409 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1410 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1411 else 1412 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1413 sock_set_flag(sk, SOCK_LINGER); 1414 } 1415 break; 1416 1417 case SO_BSDCOMPAT: 1418 break; 1419 1420 case SO_TIMESTAMP_OLD: 1421 case SO_TIMESTAMP_NEW: 1422 case SO_TIMESTAMPNS_OLD: 1423 case SO_TIMESTAMPNS_NEW: 1424 sock_set_timestamp(sk, optname, valbool); 1425 break; 1426 1427 case SO_TIMESTAMPING_NEW: 1428 case SO_TIMESTAMPING_OLD: 1429 if (optlen == sizeof(timestamping)) { 1430 if (copy_from_sockptr(×tamping, optval, 1431 sizeof(timestamping))) { 1432 ret = -EFAULT; 1433 break; 1434 } 1435 } else { 1436 memset(×tamping, 0, sizeof(timestamping)); 1437 timestamping.flags = val; 1438 } 1439 ret = sock_set_timestamping(sk, optname, timestamping); 1440 break; 1441 1442 case SO_RCVLOWAT: 1443 { 1444 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1445 1446 if (val < 0) 1447 val = INT_MAX; 1448 if (sock) 1449 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1450 if (set_rcvlowat) 1451 ret = set_rcvlowat(sk, val); 1452 else 1453 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1454 break; 1455 } 1456 case SO_ATTACH_FILTER: { 1457 struct sock_fprog fprog; 1458 1459 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1460 if (!ret) 1461 ret = sk_attach_filter(&fprog, sk); 1462 break; 1463 } 1464 case SO_ATTACH_BPF: 1465 ret = -EINVAL; 1466 if (optlen == sizeof(u32)) { 1467 u32 ufd; 1468 1469 ret = -EFAULT; 1470 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1471 break; 1472 1473 ret = sk_attach_bpf(ufd, sk); 1474 } 1475 break; 1476 1477 case SO_ATTACH_REUSEPORT_CBPF: { 1478 struct sock_fprog fprog; 1479 1480 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1481 if (!ret) 1482 ret = sk_reuseport_attach_filter(&fprog, sk); 1483 break; 1484 } 1485 case SO_ATTACH_REUSEPORT_EBPF: 1486 ret = -EINVAL; 1487 if (optlen == sizeof(u32)) { 1488 u32 ufd; 1489 1490 ret = -EFAULT; 1491 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1492 break; 1493 1494 ret = sk_reuseport_attach_bpf(ufd, sk); 1495 } 1496 break; 1497 1498 case SO_DETACH_REUSEPORT_BPF: 1499 ret = reuseport_detach_prog(sk); 1500 break; 1501 1502 case SO_DETACH_FILTER: 1503 ret = sk_detach_filter(sk); 1504 break; 1505 1506 case SO_LOCK_FILTER: 1507 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1508 ret = -EPERM; 1509 else 1510 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1511 break; 1512 1513 case SO_MARK: 1514 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1515 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1516 ret = -EPERM; 1517 break; 1518 } 1519 1520 __sock_set_mark(sk, val); 1521 break; 1522 case SO_RCVMARK: 1523 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1524 break; 1525 1526 case SO_RCVPRIORITY: 1527 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1528 break; 1529 1530 case SO_RXQ_OVFL: 1531 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1532 break; 1533 1534 case SO_WIFI_STATUS: 1535 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1536 break; 1537 1538 case SO_NOFCS: 1539 sock_valbool_flag(sk, SOCK_NOFCS, 
valbool); 1540 break; 1541 1542 case SO_SELECT_ERR_QUEUE: 1543 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1544 break; 1545 1546 case SO_PASSCRED: 1547 if (sk_may_scm_recv(sk)) 1548 sk->sk_scm_credentials = valbool; 1549 else 1550 ret = -EOPNOTSUPP; 1551 break; 1552 1553 case SO_PASSSEC: 1554 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1555 sk->sk_scm_security = valbool; 1556 else 1557 ret = -EOPNOTSUPP; 1558 break; 1559 1560 case SO_PASSPIDFD: 1561 if (sk_is_unix(sk)) 1562 sk->sk_scm_pidfd = valbool; 1563 else 1564 ret = -EOPNOTSUPP; 1565 break; 1566 1567 case SO_PASSRIGHTS: 1568 if (sk_is_unix(sk)) 1569 sk->sk_scm_rights = valbool; 1570 else 1571 ret = -EOPNOTSUPP; 1572 break; 1573 1574 case SO_INCOMING_CPU: 1575 reuseport_update_incoming_cpu(sk, val); 1576 break; 1577 1578 case SO_CNX_ADVICE: 1579 if (val == 1) 1580 dst_negative_advice(sk); 1581 break; 1582 1583 case SO_ZEROCOPY: 1584 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1585 if (!(sk_is_tcp(sk) || 1586 (sk->sk_type == SOCK_DGRAM && 1587 sk->sk_protocol == IPPROTO_UDP))) 1588 ret = -EOPNOTSUPP; 1589 } else if (sk->sk_family != PF_RDS) { 1590 ret = -EOPNOTSUPP; 1591 } 1592 if (!ret) { 1593 if (val < 0 || val > 1) 1594 ret = -EINVAL; 1595 else 1596 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1597 } 1598 break; 1599 1600 case SO_TXTIME: 1601 if (optlen != sizeof(struct sock_txtime)) { 1602 ret = -EINVAL; 1603 break; 1604 } else if (copy_from_sockptr(&sk_txtime, optval, 1605 sizeof(struct sock_txtime))) { 1606 ret = -EFAULT; 1607 break; 1608 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1609 ret = -EINVAL; 1610 break; 1611 } 1612 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1613 * scheduler has enough safe guards. 
1614 */ 1615 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1616 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1617 ret = -EPERM; 1618 break; 1619 } 1620 1621 ret = sockopt_validate_clockid(sk_txtime.clockid); 1622 if (ret) 1623 break; 1624 1625 sock_valbool_flag(sk, SOCK_TXTIME, true); 1626 sk->sk_clockid = sk_txtime.clockid; 1627 sk->sk_txtime_deadline_mode = 1628 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1629 sk->sk_txtime_report_errors = 1630 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1631 break; 1632 1633 case SO_BINDTOIFINDEX: 1634 ret = sock_bindtoindex_locked(sk, val); 1635 break; 1636 1637 case SO_BUF_LOCK: 1638 if (val & ~SOCK_BUF_LOCK_MASK) { 1639 ret = -EINVAL; 1640 break; 1641 } 1642 sk->sk_userlocks = val | (sk->sk_userlocks & 1643 ~SOCK_BUF_LOCK_MASK); 1644 break; 1645 1646 case SO_RESERVE_MEM: 1647 { 1648 int delta; 1649 1650 if (val < 0) { 1651 ret = -EINVAL; 1652 break; 1653 } 1654 1655 delta = val - sk->sk_reserved_mem; 1656 if (delta < 0) 1657 sock_release_reserved_memory(sk, -delta); 1658 else 1659 ret = sock_reserve_memory(sk, delta); 1660 break; 1661 } 1662 1663 default: 1664 ret = -ENOPROTOOPT; 1665 break; 1666 } 1667 sockopt_release_sock(sk); 1668 return ret; 1669 } 1670 1671 int sock_setsockopt(struct socket *sock, int level, int optname, 1672 sockptr_t optval, unsigned int optlen) 1673 { 1674 return sk_setsockopt(sock->sk, level, optname, 1675 optval, optlen); 1676 } 1677 EXPORT_SYMBOL(sock_setsockopt); 1678 1679 static const struct cred *sk_get_peer_cred(struct sock *sk) 1680 { 1681 const struct cred *cred; 1682 1683 spin_lock(&sk->sk_peer_lock); 1684 cred = get_cred(sk->sk_peer_cred); 1685 spin_unlock(&sk->sk_peer_lock); 1686 1687 return cred; 1688 } 1689 1690 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1691 struct ucred *ucred) 1692 { 1693 ucred->pid = pid_vnr(pid); 1694 ucred->uid = ucred->gid = -1; 1695 if (cred) { 1696 struct user_namespace *current_ns = current_user_ns(); 1697 1698 
ucred->uid = from_kuid_munged(current_ns, cred->euid); 1699 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1700 } 1701 } 1702 1703 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1704 { 1705 struct user_namespace *user_ns = current_user_ns(); 1706 int i; 1707 1708 for (i = 0; i < src->ngroups; i++) { 1709 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1710 1711 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1712 return -EFAULT; 1713 } 1714 1715 return 0; 1716 } 1717 1718 int sk_getsockopt(struct sock *sk, int level, int optname, 1719 sockptr_t optval, sockptr_t optlen) 1720 { 1721 struct socket *sock = sk->sk_socket; 1722 1723 union { 1724 int val; 1725 u64 val64; 1726 unsigned long ulval; 1727 struct linger ling; 1728 struct old_timeval32 tm32; 1729 struct __kernel_old_timeval tm; 1730 struct __kernel_sock_timeval stm; 1731 struct sock_txtime txtime; 1732 struct so_timestamping timestamping; 1733 } v; 1734 1735 int lv = sizeof(int); 1736 int len; 1737 1738 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1739 return -EFAULT; 1740 if (len < 0) 1741 return -EINVAL; 1742 1743 memset(&v, 0, sizeof(v)); 1744 1745 switch (optname) { 1746 case SO_DEBUG: 1747 v.val = sock_flag(sk, SOCK_DBG); 1748 break; 1749 1750 case SO_DONTROUTE: 1751 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1752 break; 1753 1754 case SO_BROADCAST: 1755 v.val = sock_flag(sk, SOCK_BROADCAST); 1756 break; 1757 1758 case SO_SNDBUF: 1759 v.val = READ_ONCE(sk->sk_sndbuf); 1760 break; 1761 1762 case SO_RCVBUF: 1763 v.val = READ_ONCE(sk->sk_rcvbuf); 1764 break; 1765 1766 case SO_REUSEADDR: 1767 v.val = sk->sk_reuse; 1768 break; 1769 1770 case SO_REUSEPORT: 1771 v.val = sk->sk_reuseport; 1772 break; 1773 1774 case SO_KEEPALIVE: 1775 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1776 break; 1777 1778 case SO_TYPE: 1779 v.val = sk->sk_type; 1780 break; 1781 1782 case SO_PROTOCOL: 1783 v.val = sk->sk_protocol; 1784 break; 1785 1786 case SO_DOMAIN: 1787 v.val = 
sk->sk_family; 1788 break; 1789 1790 case SO_ERROR: 1791 v.val = -sock_error(sk); 1792 if (v.val == 0) 1793 v.val = xchg(&sk->sk_err_soft, 0); 1794 break; 1795 1796 case SO_OOBINLINE: 1797 v.val = sock_flag(sk, SOCK_URGINLINE); 1798 break; 1799 1800 case SO_NO_CHECK: 1801 v.val = sk->sk_no_check_tx; 1802 break; 1803 1804 case SO_PRIORITY: 1805 v.val = READ_ONCE(sk->sk_priority); 1806 break; 1807 1808 case SO_LINGER: 1809 lv = sizeof(v.ling); 1810 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1811 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1812 break; 1813 1814 case SO_BSDCOMPAT: 1815 break; 1816 1817 case SO_TIMESTAMP_OLD: 1818 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1819 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1820 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1821 break; 1822 1823 case SO_TIMESTAMPNS_OLD: 1824 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1825 break; 1826 1827 case SO_TIMESTAMP_NEW: 1828 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1829 break; 1830 1831 case SO_TIMESTAMPNS_NEW: 1832 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1833 break; 1834 1835 case SO_TIMESTAMPING_OLD: 1836 case SO_TIMESTAMPING_NEW: 1837 lv = sizeof(v.timestamping); 1838 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only 1839 * returning the flags when they were set through the same option. 1840 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. 
1841 */ 1842 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1843 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1844 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1845 } 1846 break; 1847 1848 case SO_RCVTIMEO_OLD: 1849 case SO_RCVTIMEO_NEW: 1850 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1851 SO_RCVTIMEO_OLD == optname); 1852 break; 1853 1854 case SO_SNDTIMEO_OLD: 1855 case SO_SNDTIMEO_NEW: 1856 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1857 SO_SNDTIMEO_OLD == optname); 1858 break; 1859 1860 case SO_RCVLOWAT: 1861 v.val = READ_ONCE(sk->sk_rcvlowat); 1862 break; 1863 1864 case SO_SNDLOWAT: 1865 v.val = 1; 1866 break; 1867 1868 case SO_PASSCRED: 1869 if (!sk_may_scm_recv(sk)) 1870 return -EOPNOTSUPP; 1871 1872 v.val = sk->sk_scm_credentials; 1873 break; 1874 1875 case SO_PASSPIDFD: 1876 if (!sk_is_unix(sk)) 1877 return -EOPNOTSUPP; 1878 1879 v.val = sk->sk_scm_pidfd; 1880 break; 1881 1882 case SO_PASSRIGHTS: 1883 if (!sk_is_unix(sk)) 1884 return -EOPNOTSUPP; 1885 1886 v.val = sk->sk_scm_rights; 1887 break; 1888 1889 case SO_PEERCRED: 1890 { 1891 struct ucred peercred; 1892 if (len > sizeof(peercred)) 1893 len = sizeof(peercred); 1894 1895 spin_lock(&sk->sk_peer_lock); 1896 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1897 spin_unlock(&sk->sk_peer_lock); 1898 1899 if (copy_to_sockptr(optval, &peercred, len)) 1900 return -EFAULT; 1901 goto lenout; 1902 } 1903 1904 case SO_PEERPIDFD: 1905 { 1906 struct pid *peer_pid; 1907 struct file *pidfd_file = NULL; 1908 unsigned int flags = 0; 1909 int pidfd; 1910 1911 if (len > sizeof(pidfd)) 1912 len = sizeof(pidfd); 1913 1914 spin_lock(&sk->sk_peer_lock); 1915 peer_pid = get_pid(sk->sk_peer_pid); 1916 spin_unlock(&sk->sk_peer_lock); 1917 1918 if (!peer_pid) 1919 return -ENODATA; 1920 1921 /* The use of PIDFD_STALE requires stashing of struct pid 1922 * on pidfs with pidfs_register_pid() and only AF_UNIX 1923 * were prepared for this. 
1924 */ 1925 if (sk->sk_family == AF_UNIX) 1926 flags = PIDFD_STALE; 1927 1928 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1929 put_pid(peer_pid); 1930 if (pidfd < 0) 1931 return pidfd; 1932 1933 if (copy_to_sockptr(optval, &pidfd, len) || 1934 copy_to_sockptr(optlen, &len, sizeof(int))) { 1935 put_unused_fd(pidfd); 1936 fput(pidfd_file); 1937 1938 return -EFAULT; 1939 } 1940 1941 fd_install(pidfd, pidfd_file); 1942 return 0; 1943 } 1944 1945 case SO_PEERGROUPS: 1946 { 1947 const struct cred *cred; 1948 int ret, n; 1949 1950 cred = sk_get_peer_cred(sk); 1951 if (!cred) 1952 return -ENODATA; 1953 1954 n = cred->group_info->ngroups; 1955 if (len < n * sizeof(gid_t)) { 1956 len = n * sizeof(gid_t); 1957 put_cred(cred); 1958 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1959 } 1960 len = n * sizeof(gid_t); 1961 1962 ret = groups_to_user(optval, cred->group_info); 1963 put_cred(cred); 1964 if (ret) 1965 return ret; 1966 goto lenout; 1967 } 1968 1969 case SO_PEERNAME: 1970 { 1971 struct sockaddr_storage address; 1972 1973 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1974 if (lv < 0) 1975 return -ENOTCONN; 1976 if (lv < len) 1977 return -EINVAL; 1978 if (copy_to_sockptr(optval, &address, len)) 1979 return -EFAULT; 1980 goto lenout; 1981 } 1982 1983 /* Dubious BSD thing... Probably nobody even uses it, but 1984 * the UNIX standard wants it for whatever reason... 
-DaveM 1985 */ 1986 case SO_ACCEPTCONN: 1987 v.val = sk->sk_state == TCP_LISTEN; 1988 break; 1989 1990 case SO_PASSSEC: 1991 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 1992 return -EOPNOTSUPP; 1993 1994 v.val = sk->sk_scm_security; 1995 break; 1996 1997 case SO_PEERSEC: 1998 return security_socket_getpeersec_stream(sock, 1999 optval, optlen, len); 2000 2001 case SO_MARK: 2002 v.val = READ_ONCE(sk->sk_mark); 2003 break; 2004 2005 case SO_RCVMARK: 2006 v.val = sock_flag(sk, SOCK_RCVMARK); 2007 break; 2008 2009 case SO_RCVPRIORITY: 2010 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2011 break; 2012 2013 case SO_RXQ_OVFL: 2014 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2015 break; 2016 2017 case SO_WIFI_STATUS: 2018 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2019 break; 2020 2021 case SO_PEEK_OFF: 2022 if (!READ_ONCE(sock->ops)->set_peek_off) 2023 return -EOPNOTSUPP; 2024 2025 v.val = READ_ONCE(sk->sk_peek_off); 2026 break; 2027 case SO_NOFCS: 2028 v.val = sock_flag(sk, SOCK_NOFCS); 2029 break; 2030 2031 case SO_BINDTODEVICE: 2032 return sock_getbindtodevice(sk, optval, optlen, len); 2033 2034 case SO_GET_FILTER: 2035 len = sk_get_filter(sk, optval, len); 2036 if (len < 0) 2037 return len; 2038 2039 goto lenout; 2040 2041 case SO_LOCK_FILTER: 2042 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2043 break; 2044 2045 case SO_BPF_EXTENSIONS: 2046 v.val = bpf_tell_extensions(); 2047 break; 2048 2049 case SO_SELECT_ERR_QUEUE: 2050 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2051 break; 2052 2053 #ifdef CONFIG_NET_RX_BUSY_POLL 2054 case SO_BUSY_POLL: 2055 v.val = READ_ONCE(sk->sk_ll_usec); 2056 break; 2057 case SO_PREFER_BUSY_POLL: 2058 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2059 break; 2060 #endif 2061 2062 case SO_MAX_PACING_RATE: 2063 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2064 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2065 lv = sizeof(v.ulval); 2066 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2067 } else 
{ 2068 /* 32bit version */ 2069 v.val = min_t(unsigned long, ~0U, 2070 READ_ONCE(sk->sk_max_pacing_rate)); 2071 } 2072 break; 2073 2074 case SO_INCOMING_CPU: 2075 v.val = READ_ONCE(sk->sk_incoming_cpu); 2076 break; 2077 2078 case SO_MEMINFO: 2079 { 2080 u32 meminfo[SK_MEMINFO_VARS]; 2081 2082 sk_get_meminfo(sk, meminfo); 2083 2084 len = min_t(unsigned int, len, sizeof(meminfo)); 2085 if (copy_to_sockptr(optval, &meminfo, len)) 2086 return -EFAULT; 2087 2088 goto lenout; 2089 } 2090 2091 #ifdef CONFIG_NET_RX_BUSY_POLL 2092 case SO_INCOMING_NAPI_ID: 2093 v.val = READ_ONCE(sk->sk_napi_id); 2094 2095 /* aggregate non-NAPI IDs down to 0 */ 2096 if (!napi_id_valid(v.val)) 2097 v.val = 0; 2098 2099 break; 2100 #endif 2101 2102 case SO_COOKIE: 2103 lv = sizeof(u64); 2104 if (len < lv) 2105 return -EINVAL; 2106 v.val64 = sock_gen_cookie(sk); 2107 break; 2108 2109 case SO_ZEROCOPY: 2110 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2111 break; 2112 2113 case SO_TXTIME: 2114 lv = sizeof(v.txtime); 2115 v.txtime.clockid = sk->sk_clockid; 2116 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2117 SOF_TXTIME_DEADLINE_MODE : 0; 2118 v.txtime.flags |= sk->sk_txtime_report_errors ? 2119 SOF_TXTIME_REPORT_ERRORS : 0; 2120 break; 2121 2122 case SO_BINDTOIFINDEX: 2123 v.val = READ_ONCE(sk->sk_bound_dev_if); 2124 break; 2125 2126 case SO_NETNS_COOKIE: 2127 lv = sizeof(u64); 2128 if (len != lv) 2129 return -EINVAL; 2130 v.val64 = sock_net(sk)->net_cookie; 2131 break; 2132 2133 case SO_BUF_LOCK: 2134 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2135 break; 2136 2137 case SO_RESERVE_MEM: 2138 v.val = READ_ONCE(sk->sk_reserved_mem); 2139 break; 2140 2141 case SO_TXREHASH: 2142 if (!sk_is_tcp(sk)) 2143 return -EOPNOTSUPP; 2144 2145 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2146 v.val = READ_ONCE(sk->sk_txrehash); 2147 break; 2148 2149 default: 2150 /* We implement the SO_SNDLOWAT etc to not be settable 2151 * (1003.1g 7). 
2152 */ 2153 return -ENOPROTOOPT; 2154 } 2155 2156 if (len > lv) 2157 len = lv; 2158 if (copy_to_sockptr(optval, &v, len)) 2159 return -EFAULT; 2160 lenout: 2161 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2162 return -EFAULT; 2163 return 0; 2164 } 2165 2166 /* 2167 * Initialize an sk_lock. 2168 * 2169 * (We also register the sk_lock with the lock validator.) 2170 */ 2171 static inline void sock_lock_init(struct sock *sk) 2172 { 2173 sk_owner_clear(sk); 2174 2175 if (sk->sk_kern_sock) 2176 sock_lock_init_class_and_name( 2177 sk, 2178 af_family_kern_slock_key_strings[sk->sk_family], 2179 af_family_kern_slock_keys + sk->sk_family, 2180 af_family_kern_key_strings[sk->sk_family], 2181 af_family_kern_keys + sk->sk_family); 2182 else 2183 sock_lock_init_class_and_name( 2184 sk, 2185 af_family_slock_key_strings[sk->sk_family], 2186 af_family_slock_keys + sk->sk_family, 2187 af_family_key_strings[sk->sk_family], 2188 af_family_keys + sk->sk_family); 2189 } 2190 2191 /* 2192 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2193 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2194 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2195 */ 2196 static void sock_copy(struct sock *nsk, const struct sock *osk) 2197 { 2198 const struct proto *prot = READ_ONCE(osk->sk_prot); 2199 #ifdef CONFIG_SECURITY_NETWORK 2200 void *sptr = nsk->sk_security; 2201 #endif 2202 2203 /* If we move sk_tx_queue_mapping out of the private section, 2204 * we must check if sk_tx_queue_clear() is called after 2205 * sock_copy() in sk_clone_lock(). 
2206 */ 2207 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2208 offsetof(struct sock, sk_dontcopy_begin) || 2209 offsetof(struct sock, sk_tx_queue_mapping) >= 2210 offsetof(struct sock, sk_dontcopy_end)); 2211 2212 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2213 2214 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2215 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2216 /* alloc is larger than struct, see sk_prot_alloc() */); 2217 2218 #ifdef CONFIG_SECURITY_NETWORK 2219 nsk->sk_security = sptr; 2220 security_sk_clone(osk, nsk); 2221 #endif 2222 } 2223 2224 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2225 int family) 2226 { 2227 struct sock *sk; 2228 struct kmem_cache *slab; 2229 2230 slab = prot->slab; 2231 if (slab != NULL) { 2232 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2233 if (!sk) 2234 return sk; 2235 if (want_init_on_alloc(priority)) 2236 sk_prot_clear_nulls(sk, prot->obj_size); 2237 } else 2238 sk = kmalloc(prot->obj_size, priority); 2239 2240 if (sk != NULL) { 2241 if (security_sk_alloc(sk, family, priority)) 2242 goto out_free; 2243 2244 if (!try_module_get(prot->owner)) 2245 goto out_free_sec; 2246 } 2247 2248 return sk; 2249 2250 out_free_sec: 2251 security_sk_free(sk); 2252 out_free: 2253 if (slab != NULL) 2254 kmem_cache_free(slab, sk); 2255 else 2256 kfree(sk); 2257 return NULL; 2258 } 2259 2260 static void sk_prot_free(struct proto *prot, struct sock *sk) 2261 { 2262 struct kmem_cache *slab; 2263 struct module *owner; 2264 2265 owner = prot->owner; 2266 slab = prot->slab; 2267 2268 cgroup_sk_free(&sk->sk_cgrp_data); 2269 mem_cgroup_sk_free(sk); 2270 security_sk_free(sk); 2271 2272 sk_owner_put(sk); 2273 2274 if (slab != NULL) 2275 kmem_cache_free(slab, sk); 2276 else 2277 kfree(sk); 2278 module_put(owner); 2279 } 2280 2281 /** 2282 * sk_alloc - All socket objects are allocated here 2283 * @net: the applicable net namespace 2284 * @family: protocol family 2285 * 
@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2286 * @prot: struct proto associated with this new sock instance 2287 * @kern: is this to be a kernel socket? 2288 */ 2289 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2290 struct proto *prot, int kern) 2291 { 2292 struct sock *sk; 2293 2294 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2295 if (sk) { 2296 sk->sk_family = family; 2297 /* 2298 * See comment in struct sock definition to understand 2299 * why we need sk_prot_creator -acme 2300 */ 2301 sk->sk_prot = sk->sk_prot_creator = prot; 2302 2303 if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) 2304 sk->sk_bypass_prot_mem = 1; 2305 2306 sk->sk_kern_sock = kern; 2307 sock_lock_init(sk); 2308 2309 sk->sk_net_refcnt = kern ? 0 : 1; 2310 if (likely(sk->sk_net_refcnt)) { 2311 get_net_track(net, &sk->ns_tracker, priority); 2312 sock_inuse_add(net, 1); 2313 } else { 2314 net_passive_inc(net); 2315 __netns_tracker_alloc(net, &sk->ns_tracker, 2316 false, priority); 2317 } 2318 2319 sock_net_set(sk, net); 2320 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2321 2322 mem_cgroup_sk_alloc(sk); 2323 cgroup_sk_alloc(&sk->sk_cgrp_data); 2324 sock_update_classid(&sk->sk_cgrp_data); 2325 sock_update_netprioidx(&sk->sk_cgrp_data); 2326 sk_tx_queue_clear(sk); 2327 } 2328 2329 return sk; 2330 } 2331 EXPORT_SYMBOL(sk_alloc); 2332 2333 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2334 * grace period. This is the case for UDP sockets and TCP listeners. 
2335 */ 2336 static void __sk_destruct(struct rcu_head *head) 2337 { 2338 struct sock *sk = container_of(head, struct sock, sk_rcu); 2339 struct net *net = sock_net(sk); 2340 struct sk_filter *filter; 2341 2342 if (sk->sk_destruct) 2343 sk->sk_destruct(sk); 2344 2345 filter = rcu_dereference_check(sk->sk_filter, 2346 refcount_read(&sk->sk_wmem_alloc) == 0); 2347 if (filter) { 2348 sk_filter_uncharge(sk, filter); 2349 RCU_INIT_POINTER(sk->sk_filter, NULL); 2350 } 2351 2352 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2353 2354 #ifdef CONFIG_BPF_SYSCALL 2355 bpf_sk_storage_free(sk); 2356 #endif 2357 2358 if (atomic_read(&sk->sk_omem_alloc)) 2359 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2360 __func__, atomic_read(&sk->sk_omem_alloc)); 2361 2362 if (sk->sk_frag.page) { 2363 put_page(sk->sk_frag.page); 2364 sk->sk_frag.page = NULL; 2365 } 2366 2367 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2368 put_cred(sk->sk_peer_cred); 2369 put_pid(sk->sk_peer_pid); 2370 2371 if (likely(sk->sk_net_refcnt)) { 2372 put_net_track(net, &sk->ns_tracker); 2373 } else { 2374 __netns_tracker_free(net, &sk->ns_tracker, false); 2375 net_passive_dec(net); 2376 } 2377 sk_prot_free(sk->sk_prot_creator, sk); 2378 } 2379 2380 void sk_net_refcnt_upgrade(struct sock *sk) 2381 { 2382 struct net *net = sock_net(sk); 2383 2384 WARN_ON_ONCE(sk->sk_net_refcnt); 2385 __netns_tracker_free(net, &sk->ns_tracker, false); 2386 net_passive_dec(net); 2387 sk->sk_net_refcnt = 1; 2388 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2389 sock_inuse_add(net, 1); 2390 } 2391 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2392 2393 void sk_destruct(struct sock *sk) 2394 { 2395 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2396 2397 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2398 reuseport_detach_sock(sk); 2399 use_call_rcu = true; 2400 } 2401 2402 if (use_call_rcu) 2403 call_rcu(&sk->sk_rcu, __sk_destruct); 2404 else 2405 __sk_destruct(&sk->sk_rcu); 2406 } 2407 2408 
/*
 * Release @sk once sk_wmem_alloc has reached zero. Sockets holding a netns
 * reference are removed from the in-use count, and if sock_diag destroy
 * listeners exist the destruction is routed through
 * sock_diag_broadcast_destroy() instead of calling sk_destruct() directly.
 */
static void __sk_free(struct sock *sk)
{
	if (likely(sk->sk_net_refcnt))
		sock_inuse_add(sock_net(sk), -1);

	if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
		sock_diag_broadcast_destroy(sk);
	else
		sk_destruct(sk);
}

void sk_free(struct sock *sk)
{
	/*
	 * We subtract one from sk_wmem_alloc and can know if
	 * some packets are still in some tx queue.
	 * If not null, sock_wfree() will call __sk_free(sk) later
	 */
	if (refcount_dec_and_test(&sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sk_free);

/*
 * Initialise the receive, write and error queues and the callback lock of
 * @sk, and give each of those locks a lockdep class indexed by sk_family.
 * The callback lock uses a separate class set for kernel sockets.
 */
static void sk_init_common(struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
			af_rlock_keys + sk->sk_family,
			af_family_rlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_write_queue.lock,
			af_wlock_keys + sk->sk_family,
			af_family_wlock_key_strings[sk->sk_family]);
	lockdep_set_class_and_name(&sk->sk_error_queue.lock,
			af_elock_keys + sk->sk_family,
			af_family_elock_key_strings[sk->sk_family]);
	if (sk->sk_kern_sock)
		lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_kern_callback_keys + sk->sk_family,
			af_family_kern_clock_key_strings[sk->sk_family]);
	else
		lockdep_set_class_and_name(&sk->sk_callback_lock,
			af_callback_keys + sk->sk_family,
			af_family_clock_key_strings[sk->sk_family]);
}

/**
 * sk_clone - clone a socket
 * @sk: the socket to clone
 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 * @lock: if true, lock the cloned sk
 *
 * If @lock is true, the clone is locked by bh_lock_sock(), and
 * caller must unlock socket even in error path by bh_unlock_sock().
 *
 * Return: the new socket, or NULL if allocation, filter charging,
 * xfrm policy cloning or BPF storage cloning failed.
 */
struct sock *sk_clone(const struct sock *sk, const gfp_t priority,
		      bool lock)
{
	struct proto *prot = READ_ONCE(sk->sk_prot);
	struct sk_filter *filter;
	bool is_charged = true;
	struct sock *newsk;

	newsk = sk_prot_alloc(prot, priority, sk->sk_family);
	if (!newsk)
		goto out;

	sock_copy(newsk, sk);

	newsk->sk_prot_creator = prot;

	/* SANITY */
	if (likely(newsk->sk_net_refcnt)) {
		get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
		sock_inuse_add(sock_net(newsk), 1);
	} else {
		/* Kernel sockets are not elevating the struct net refcount.
		 * Instead, use a tracker to more easily detect if a layer
		 * is not properly dismantling its kernel sockets at netns
		 * destroy time.
		 */
		net_passive_inc(sock_net(newsk));
		__netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
				      false, priority);
	}

	sk_node_init(&newsk->sk_node);
	sock_lock_init(newsk);

	if (lock)
		bh_lock_sock(newsk);

	/* From here on, reset the per-socket state copied from the parent. */
	newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
	newsk->sk_backlog.len = 0;

	atomic_set(&newsk->sk_rmem_alloc, 0);

	refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS);

	atomic_set(&newsk->sk_omem_alloc, 0);
	sk_init_common(newsk);

	newsk->sk_dst_cache = NULL;
	newsk->sk_dst_pending_confirm = 0;
	newsk->sk_wmem_queued = 0;
	newsk->sk_forward_alloc = 0;
	newsk->sk_reserved_mem = 0;
	DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters);
	sk_drops_reset(newsk);
	newsk->sk_send_head = NULL;
	newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
	atomic_set(&newsk->sk_zckey, 0);

	sock_reset_flag(newsk, SOCK_DONE);

#ifdef CONFIG_MEMCG
	/* sk->sk_memcg will be populated at accept() time */
	newsk->sk_memcg = NULL;
#endif

	cgroup_sk_clone(&newsk->sk_cgrp_data);

	rcu_read_lock();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		/* though it's an empty new sock, the charging may fail
		 * if sysctl_optmem_max was changed between creation of
		 * original socket and cloning
		 */
		is_charged = sk_filter_charge(newsk, filter);
	RCU_INIT_POINTER(newsk->sk_filter, filter);
	rcu_read_unlock();

	if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
		/* We need to make sure that we don't uncharge the new
		 * socket if we couldn't charge it in the first place
		 * as otherwise we uncharge the parent's filter.
		 */
		if (!is_charged)
			RCU_INIT_POINTER(newsk->sk_filter, NULL);

		goto free;
	}

	RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);

	if (bpf_sk_storage_clone(sk, newsk))
		goto free;

	/* Clear sk_user_data if parent had the pointer tagged
	 * as not suitable for copying when cloning.
	 */
	if (sk_user_data_is_nocopy(newsk))
		newsk->sk_user_data = NULL;

	newsk->sk_err = 0;
	newsk->sk_err_soft = 0;
	newsk->sk_priority = 0;
	newsk->sk_incoming_cpu = raw_smp_processor_id();

	/* Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.rst for details)
	 */
	smp_wmb();
	refcount_set(&newsk->sk_refcnt, 2);

	sk_set_socket(newsk, NULL);
	sk_tx_queue_clear(newsk);
	sk_rx_queue_clear(newsk);
	RCU_INIT_POINTER(newsk->sk_wq, NULL);

	if (newsk->sk_prot->sockets_allocated)
		sk_sockets_allocated_inc(newsk);

	if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
		net_enable_timestamp();
out:
	return newsk;
free:
	/* It is still raw copy of parent, so invalidate
	 * destructor and make plain sk_free()
	 */
	newsk->sk_destruct = NULL;
	if (lock)
		bh_unlock_sock(newsk);
	sk_free(newsk);
	newsk = NULL;
	goto out;
}
EXPORT_SYMBOL_GPL(sk_clone);

/*
 * Return the GSO size limit of @dev that applies to @sk: gso_max_size for
 * AF_INET6 sockets whose receive address is not v4-mapped, gso_ipv4_max_size
 * otherwise. Non-TCP sockets are capped at GSO_LEGACY_MAX_SIZE, and
 * MAX_TCP_HEADER + 1 bytes are subtracted from the result.
 */
static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev)
{
	bool is_ipv6 = false;
	u32 max_size;

#if IS_ENABLED(CONFIG_IPV6)
	is_ipv6 = (sk->sk_family == AF_INET6 &&
		   !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
#endif
	/* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
	max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) :
			READ_ONCE(dev->gso_ipv4_max_size);
	if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
		max_size = GSO_LEGACY_MAX_SIZE;

	return max_size - (MAX_TCP_HEADER + 1);
}

/*
 * Compute sk_route_caps, sk_gso_max_size and sk_gso_max_segs for @sk from
 * the device behind @dst, then install @dst with sk_dst_set(). The device
 * is looked up and used inside an RCU read-side section.
 */
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
	const struct net_device *dev;
	u32 max_segs = 1;

	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	sk->sk_route_caps = dev->features;
	if (sk_is_tcp(sk)) {
		struct inet_connection_sock *icsk = inet_csk(sk);

		sk->sk_route_caps |= NETIF_F_GSO;
		icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK);
	}
	if (sk->sk_route_caps & NETIF_F_GSO)
		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
	if (unlikely(sk->sk_gso_disabled))
		sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
	if (sk_can_gso(sk)) {
		/* A dst with extra header length that cannot be offloaded
		 * loses all GSO capabilities.
		 */
		if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
		} else {
			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
			sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev);
			/* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
			max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1);
		}
	}
	sk->sk_gso_max_segs = max_segs;
	sk_dst_set(sk, dst);
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sk_setup_caps);

/*
 *	Simple resource managers for sockets.
 */


/*
 * Write buffer destructor automatically called from kfree_skb.
 */
void sock_wfree(struct sk_buff *skb)
{
	unsigned int len = skb->truesize;
	struct sock *sk = skb->sk;
	bool free;
	int old;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/* Fast path: SOCK_RCU_FREE socket still using the default
		 * write-space callback. Drop the whole charge at once and
		 * issue the wakeup from inside an RCU read-side section.
		 */
		if (sock_flag(sk, SOCK_RCU_FREE) &&
		    sk->sk_write_space == sock_def_write_space) {
			rcu_read_lock();
			free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc,
						       &old);
			sock_def_write_space_wfree(sk, old - len);
			rcu_read_unlock();
			if (unlikely(free))
				__sk_free(sk);
			return;
		}

		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

/* This variant of sock_wfree() is used by TCP,
 * since it sets SOCK_USE_WRITE_QUEUE.
 */
void __sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* No write-space callback here: only uncharge, and free the socket
	 * if this was the last outstanding charge.
	 */
	if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
		__sk_free(sk);
}

/*
 * Make @sk the owner of @skb on the transmit side: any previous owner is
 * dropped with skb_orphan(), the destructor is set to sock_wfree() and
 * skb->truesize is charged to sk_wmem_alloc. With CONFIG_INET, sockets that
 * are not full sockets are handed to skb_set_owner_edemux() instead.
 */
void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	int old_wmem;

	skb_orphan(skb);
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk)))
		return skb_set_owner_edemux(skb, sk);
#endif
	skb->sk = sk;
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	__refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem);

	/* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket
	 * is in a host queue (qdisc, NIC queue).
	 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue
	 * based on XPS for better performance.
	 * Otherwise clear ooo_okay to not risk Out Of Order delivery.
	 */
	skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS);
}
EXPORT_SYMBOL(skb_set_owner_w);

/*
 * Tell whether @skb may be partially orphaned: it must not be a decrypted
 * skb, and its destructor must be sock_wfree() or, with CONFIG_INET,
 * tcp_wfree().
 */
static bool can_skb_orphan_partial(const struct sk_buff *skb)
{
	/* Drivers depend on in-order delivery for crypto offload,
	 * partial orphan breaks out-of-order-OK logic.
	 */
	if (skb_is_decrypted(skb))
		return false;

	return (skb->destructor == sock_wfree ||
		(IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
}

/* This helper is used by netem, as it can hold packets in its
 * delay queue. We want to allow the owner socket to send more
 * packets, as if they were already TX completed by a typical driver.
 * But we also want to keep skb->sk set because some packet schedulers
 * rely on it (sch_fq for example).
 */
void skb_orphan_partial(struct sk_buff *skb)
{
	/* Pure TCP ACKs are left untouched. */
	if (skb_is_tcp_pure_ack(skb))
		return;

	/* Prefer re-owning the skb via skb_set_owner_sk_safe(), which keeps
	 * skb->sk set; fall back to a full orphan if that is not possible.
	 */
	if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
		return;

	skb_orphan(skb);
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 * Removes skb->truesize from sk_rmem_alloc and uncharges it from the socket.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 * It only drops the socket reference held through skb->sk.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

/* Buffer destructor for prefetch/receive path where reference count may
 * not be held, e.g. for listen sockets.
 */
#ifdef CONFIG_INET
void sock_pfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;

	/* Nothing to release if no reference was taken for this socket. */
	if (!sk_is_refcounted(sk))
		return;

	/* Syncookie request sockets are freed directly, after clearing
	 * their listener pointer.
	 */
	if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) {
		inet_reqsk(sk)->rsk_listener = NULL;
		reqsk_free(inet_reqsk(sk));
		return;
	}

	sock_gen_put(sk);
}
EXPORT_SYMBOL(sock_pfree);
#endif /* CONFIG_INET */

/*
 * Allocate a skb from the socket's send buffer.
2816 */ 2817 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2818 gfp_t priority) 2819 { 2820 if (force || 2821 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2822 struct sk_buff *skb = alloc_skb(size, priority); 2823 2824 if (skb) { 2825 skb_set_owner_w(skb, sk); 2826 return skb; 2827 } 2828 } 2829 return NULL; 2830 } 2831 EXPORT_SYMBOL(sock_wmalloc); 2832 2833 static void sock_ofree(struct sk_buff *skb) 2834 { 2835 struct sock *sk = skb->sk; 2836 2837 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2838 } 2839 2840 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2841 gfp_t priority) 2842 { 2843 struct sk_buff *skb; 2844 2845 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2846 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2847 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2848 return NULL; 2849 2850 skb = alloc_skb(size, priority); 2851 if (!skb) 2852 return NULL; 2853 2854 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2855 skb->sk = sk; 2856 skb->destructor = sock_ofree; 2857 return skb; 2858 } 2859 2860 /* 2861 * Allocate a memory block from the socket's option memory buffer. 2862 */ 2863 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2864 { 2865 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2866 2867 if ((unsigned int)size <= optmem_max && 2868 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2869 void *mem; 2870 /* First do the add, to avoid the race if kmalloc 2871 * might sleep. 2872 */ 2873 atomic_add(size, &sk->sk_omem_alloc); 2874 mem = kmalloc(size, priority); 2875 if (mem) 2876 return mem; 2877 atomic_sub(size, &sk->sk_omem_alloc); 2878 } 2879 return NULL; 2880 } 2881 EXPORT_SYMBOL(sock_kmalloc); 2882 2883 /* 2884 * Duplicate the input "src" memory block using the socket's 2885 * option memory buffer. 
2886 */ 2887 void *sock_kmemdup(struct sock *sk, const void *src, 2888 int size, gfp_t priority) 2889 { 2890 void *mem; 2891 2892 mem = sock_kmalloc(sk, size, priority); 2893 if (mem) 2894 memcpy(mem, src, size); 2895 return mem; 2896 } 2897 EXPORT_SYMBOL(sock_kmemdup); 2898 2899 /* Free an option memory block. Note, we actually want the inline 2900 * here as this allows gcc to detect the nullify and fold away the 2901 * condition entirely. 2902 */ 2903 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2904 const bool nullify) 2905 { 2906 if (WARN_ON_ONCE(!mem)) 2907 return; 2908 if (nullify) 2909 kfree_sensitive(mem); 2910 else 2911 kfree(mem); 2912 atomic_sub(size, &sk->sk_omem_alloc); 2913 } 2914 2915 void sock_kfree_s(struct sock *sk, void *mem, int size) 2916 { 2917 __sock_kfree_s(sk, mem, size, false); 2918 } 2919 EXPORT_SYMBOL(sock_kfree_s); 2920 2921 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2922 { 2923 __sock_kfree_s(sk, mem, size, true); 2924 } 2925 EXPORT_SYMBOL(sock_kzfree_s); 2926 2927 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2928 I think, these locks should be removed for datagram sockets. 
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
	DEFINE_WAIT(wait);

	sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
	/* Sleep until there is send buffer room, the timeout expires, a
	 * signal is pending, the send side is shut down or an error is set.
	 */
	for (;;) {
		if (!timeo)
			break;
		if (signal_pending(current))
			break;
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
		if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
			break;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			break;
		if (READ_ONCE(sk->sk_err))
			break;
		timeo = schedule_timeout(timeo);
	}
	finish_wait(sk_sleep(sk), &wait);
	return timeo;
}


/*
 *	Generic send/receive buffer handlers
 */

/*
 * Allocate an skb for sending on @sk, waiting for send buffer space unless
 * @noblock is set. On success the skb is owned by @sk. On failure NULL is
 * returned and *@errcode holds the error: a pending socket error, -EPIPE
 * after send shutdown, -EAGAIN when the timeout is zero, the value from
 * sock_intr_errno() when interrupted by a signal, or whatever
 * alloc_skb_with_frags() stored there.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
				     unsigned long data_len, int noblock,
				     int *errcode, int max_page_order)
{
	struct sk_buff *skb;
	long timeo;
	int err;

	timeo = sock_sndtimeo(sk, noblock);
	for (;;) {
		err = sock_error(sk);
		if (err != 0)
			goto failure;

		err = -EPIPE;
		if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
			goto failure;

		if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
			break;

		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
		err = -EAGAIN;
		if (!timeo)
			goto failure;
		if (signal_pending(current))
			goto interrupted;
		timeo = sock_wait_for_wmem(sk, timeo);
	}
	skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
				   errcode, sk->sk_allocation);
	if (skb)
		skb_set_owner_w(skb, sk);
	return skb;

interrupted:
	err = sock_intr_errno(timeo);
failure:
	*errcode = err;
	return NULL;
}
EXPORT_SYMBOL(sock_alloc_send_pskb);

/*
 * Parse one control message into @sockc, dispatching on cmsg_type.
 * Returns 0 on success, -EINVAL for an unknown type, a bad length or a
 * rejected value, and -EPERM when a capability/priority check fails.
 */
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
		     struct sockcm_cookie *sockc)
{
	u32 tsflags;

	BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31));

	switch (cmsg->cmsg_type) {
	case SO_MARK:
		if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
		    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->mark = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SO_TIMESTAMPING_OLD:
	case SO_TIMESTAMPING_NEW:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;

		/* Only the TX record flags may be supplied per message. */
		tsflags = *(u32 *)CMSG_DATA(cmsg);
		if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
			return -EINVAL;

		sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
		sockc->tsflags |= tsflags;
		break;
	case SCM_TXTIME:
		if (!sock_flag(sk, SOCK_TXTIME))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
			return -EINVAL;
		sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
		break;
	case SCM_TS_OPT_ID:
		if (sk_is_tcp(sk))
			return -EINVAL;
		tsflags = READ_ONCE(sk->sk_tsflags);
		if (!(tsflags & SOF_TIMESTAMPING_OPT_ID))
			return -EINVAL;
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg);
		sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID;
		break;
	/* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
	case SCM_RIGHTS:
	case SCM_CREDENTIALS:
		break;
	case SO_PRIORITY:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg)))
			return -EPERM;
		sockc->priority = *(u32 *)CMSG_DATA(cmsg);
		break;
	case SCM_DEVMEM_DMABUF:
		if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
			return -EINVAL;
		sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg);
		break;
	default:
		return -EINVAL;
	}
	return 0;
}
EXPORT_SYMBOL(__sock_cmsg_send);

/*
 * Walk every control message of @msg and feed the SOL_SOCKET ones to
 * __sock_cmsg_send(); other levels are skipped. Stops at the first error.
 */
int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
		   struct sockcm_cookie *sockc)
{
	struct cmsghdr *cmsg;
	int ret;

	for_each_cmsghdr(cmsg, msg) {
		if (!CMSG_OK(msg, cmsg))
			return -EINVAL;
		if (cmsg->cmsg_level != SOL_SOCKET)
			continue;
		ret = __sock_cmsg_send(sk, cmsg, sockc);
		if (ret)
			return ret;
	}
	return 0;
}
EXPORT_SYMBOL(sock_cmsg_send);

/* Invoke the protocol's enter_memory_pressure hook, if it provides one. */
static void sk_enter_memory_pressure(struct sock *sk)
{
	if (!sk->sk_prot->enter_memory_pressure)
		return;

	sk->sk_prot->enter_memory_pressure(sk);
}

/*
 * Invoke the protocol's leave_memory_pressure hook; when there is none,
 * clear the protocol's memory_pressure word directly if it is set.
 */
static void sk_leave_memory_pressure(struct sock *sk)
{
	if (sk->sk_prot->leave_memory_pressure) {
		INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
				     tcp_leave_memory_pressure, sk);
	} else {
		unsigned long *memory_pressure = sk->sk_prot->memory_pressure;

		if (memory_pressure && READ_ONCE(*memory_pressure))
			WRITE_ONCE(*memory_pressure, 0);
	}
}

DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);

/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that
 * allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
	if (pfrag->page) {
		/* Sole owner of the page: reuse it from the start. */
		if (page_ref_count(pfrag->page) == 1) {
			pfrag->offset = 0;
			return true;
		}
		if (pfrag->offset + sz <= pfrag->size)
			return true;
		put_page(pfrag->page);
	}

	pfrag->offset = 0;
	if (SKB_FRAG_PAGE_ORDER &&
	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
		/* Avoid direct reclaim but allow kswapd to wake */
		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
		if (likely(pfrag->page)) {
			pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
			return true;
		}
	}
	/* Fall back to a single page with the caller's original gfp mask. */
	pfrag->page = alloc_page(gfp);
	if (likely(pfrag->page)) {
		pfrag->size = PAGE_SIZE;
		return true;
	}
	return false;
}
EXPORT_SYMBOL(skb_page_frag_refill);

/*
 * Socket flavour of skb_page_frag_refill(): asks for at least 32 bytes using
 * sk->sk_allocation. On failure it enters memory pressure (unless the socket
 * bypasses protocol memory accounting), moderates the send buffer and
 * returns false.
 */
bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
{
	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
		return true;

	if (!sk->sk_bypass_prot_mem)
		sk_enter_memory_pressure(sk);

	sk_stream_moderate_sndbuf(sk);

	return false;
}
EXPORT_SYMBOL(sk_page_frag_refill);

/*
 * Sleep, uninterruptibly and as an exclusive waiter on sk_lock.wq, until the
 * socket is no longer owned by a user. sk_lock.slock is dropped around each
 * schedule() and held again on return, as the annotations state.
 */
static void __lock_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
					TASK_UNINTERRUPTIBLE);
		spin_unlock_bh(&sk->sk_lock.slock);
		schedule();
		spin_lock_bh(&sk->sk_lock.slock);
		if (!sock_owned_by_user(sk))
			break;
	}
	finish_wait(&sk->sk_lock.wq, &wait);
}

/*
 * Process every skb queued on the socket backlog through sk_backlog_rcv().
 * The backlog list is detached and walked with sk_lock.slock released; the
 * lock is re-taken before checking for newly queued packets and is held on
 * return.
 */
void __release_sock(struct sock *sk)
	__releases(&sk->sk_lock.slock)
	__acquires(&sk->sk_lock.slock)
{
	struct sk_buff *skb, *next;
	int nb = 0;

	while ((skb = sk->sk_backlog.head) != NULL) {
		sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		while (1) {
			next = skb->next;
			prefetch(next);
			DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			skb = next;
			if (!skb)
				break;

			/* Offer to reschedule every 16 packets. */
			if (!(++nb & 15))
				cond_resched();
		}

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

/*
 * Flush the backlog of @sk under sk_lock.slock and then run the protocol's
 * release_cb, if it has one.
 */
void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);

	if (sk->sk_prot->release_cb)
		INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
				     tcp_release_cb, sk);

	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL_GPL(__sk_flush_backlog);

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	/* Wait until the tail of the receive queue is no longer @skb. */
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *	Unlike the globally shared limits among the sockets under same protocol,
 *	consuming the budget of a memcg won't have direct effect on other ones.
 *	So be optimistic about memcg's tolerance, and leave the callers to decide
 *	whether or not to raise allocated through sk_under_memory_pressure() or
 *	its variants.
 *
 *	Return: 1 if the allocation is accepted, 0 if it is refused (in which
 *	case the protocol and memcg charges taken here are undone).
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	bool memcg_enabled = false, charged = false;
	struct proto *prot = sk->sk_prot;
	long allocated = 0;

	/* Sockets bypassing protocol memory accounting keep allocated == 0. */
	if (!sk->sk_bypass_prot_mem) {
		sk_memory_allocated_add(sk, amt);
		allocated = sk_memory_allocated(sk);
	}

	if (mem_cgroup_sk_enabled(sk)) {
		memcg_enabled = true;
		charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge());
		if (!charged)
			goto suppress_allocation;
	}

	if (!allocated)
		return 1;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure (either global
	 * or memcg) to make sure features described in RFC 7323 (TCP
	 * Extensions for High Performance) work properly.
	 *
	 * This rule does NOT apply once the global or memcg hard limit
	 * is exceeded, or else a DoS attack could be mounted by spawning
	 * lots of sockets whose usage is under the minimum buffer size.
	 */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
			return 1;

	} else { /* SK_MEM_SEND */
		int wmem0 = sk_get_wmem0(sk, prot);

		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < wmem0)
				return 1;
		} else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
				return 1;
		}
	}

	if (sk_has_memory_pressure(sk)) {
		u64 alloc;

		/* The following 'average' heuristic is within the
		 * scope of global accounting, so it only makes
		 * sense for global memory pressure.
		 */
		if (!sk_under_global_memory_pressure(sk))
			return 1;

		/* Try to be fair among all the sockets under global
		 * pressure by allowing the ones with below-average
		 * usage to raise.
		 */
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so that we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
			/* Force charge with __GFP_NOFAIL */
			if (memcg_enabled && !charged)
				mem_cgroup_sk_charge(sk, amt,
					gfp_memcg_charge() | __GFP_NOFAIL);
			return 1;
		}
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

	/* Refused: roll back whatever was charged above. */
	if (allocated)
		sk_memory_allocated_sub(sk, amt);

	if (charged)
		mem_cgroup_sk_uncharge(sk, amt);

	return 0;
}

/**
 *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@kind: allocation type
 *
 *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 *	rmem allocation. This function assumes that protocols which have
 *	memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	int ret, amt = sk_mem_pages(size);

	/* Credit the pages optimistically; take them back on refusal. */
	sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
	ret = __sk_mem_raise_allocated(sk, size, amt, kind);
	if (!ret)
		sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
	return ret;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 *	__sk_mem_reduce_allocated - reclaim memory_allocated
 *	@sk: socket
 *	@amount: number of quanta
 *
 *	Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
 */
void __sk_mem_reduce_allocated(struct sock *sk, int amount)
{
	if (mem_cgroup_sk_enabled(sk))
		mem_cgroup_sk_uncharge(sk, amount);

	/* Nothing was charged to the protocol for bypassing sockets. */
	if (sk->sk_bypass_prot_mem)
		return;

	sk_memory_allocated_sub(sk, amount);

	if (sk_under_global_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}

/**
 *	__sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
 *	@sk: socket
 *	@amount: number of bytes (rounded down to
 *	a PAGE_SIZE multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	/* Convert bytes to whole pages; the remainder is not reclaimed. */
	amount >>= PAGE_SHIFT;
	sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
	__sk_mem_reduce_allocated(sk, amount);
}
EXPORT_SYMBOL(__sk_mem_reclaim);

/*
 * Charge @sk to its memory cgroup, if it has one, for the pages covered by
 * sk_forward_alloc plus sk_rmem_alloc, then charge the socket object itself
 * with kmem_cache_charge(). Both charges are made with __GFP_NOFAIL.
 */
void __sk_charge(struct sock *sk, gfp_t gfp)
{
	int amt;

	gfp |= __GFP_NOFAIL;
	if (mem_cgroup_from_sk(sk)) {
		/* The socket has not been accepted yet, no need
		 * to look at newsk->sk_wmem_queued.
		 */
		amt = sk_mem_pages(sk->sk_forward_alloc +
				   atomic_read(&sk->sk_rmem_alloc));
		if (amt)
			mem_cgroup_sk_charge(sk, amt, gfp);
	}

	kmem_cache_charge(sk, gfp);
}

/* Store @val as the socket's peek offset; always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
	WRITE_ONCE(sk->sk_peek_off, val);
	return 0;
}
EXPORT_SYMBOL_GPL(sk_set_peek_off);

/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

/* bind() stub: always fails with -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

/* connect() stub: always fails with -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

/* socketpair() stub: always fails with -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

/* accept() stub: always fails with -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   struct proto_accept_arg *arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

/* getname() stub: always fails with -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

/* ioctl() stub: always fails with -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

/* listen() stub: always fails with -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

/* shutdown() stub: always fails with -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

/* sendmsg() stub: always fails with -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

/* sendmsg_locked() stub: always fails with -EOPNOTSUPP. */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg_locked);

/* recvmsg() stub: always fails with -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

/* mmap() stub: unlike the other stubs, this one returns -ENODEV. */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap); 3553 3554 /* 3555 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3556 * various sock-based usage counts. 3557 */ 3558 void __receive_sock(struct file *file) 3559 { 3560 struct socket *sock; 3561 3562 sock = sock_from_file(file); 3563 if (sock) { 3564 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3565 sock_update_classid(&sock->sk->sk_cgrp_data); 3566 } 3567 } 3568 3569 /* 3570 * Default Socket Callbacks 3571 */ 3572 3573 static void sock_def_wakeup(struct sock *sk) 3574 { 3575 struct socket_wq *wq; 3576 3577 rcu_read_lock(); 3578 wq = rcu_dereference(sk->sk_wq); 3579 if (skwq_has_sleeper(wq)) 3580 wake_up_interruptible_all(&wq->wait); 3581 rcu_read_unlock(); 3582 } 3583 3584 static void sock_def_error_report(struct sock *sk) 3585 { 3586 struct socket_wq *wq; 3587 3588 rcu_read_lock(); 3589 wq = rcu_dereference(sk->sk_wq); 3590 if (skwq_has_sleeper(wq)) 3591 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3592 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3593 rcu_read_unlock(); 3594 } 3595 3596 void sock_def_readable(struct sock *sk) 3597 { 3598 struct socket_wq *wq; 3599 3600 trace_sk_data_ready(sk); 3601 3602 rcu_read_lock(); 3603 wq = rcu_dereference(sk->sk_wq); 3604 if (skwq_has_sleeper(wq)) 3605 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3606 EPOLLRDNORM | EPOLLRDBAND); 3607 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3608 rcu_read_unlock(); 3609 } 3610 3611 static void sock_def_write_space(struct sock *sk) 3612 { 3613 struct socket_wq *wq; 3614 3615 rcu_read_lock(); 3616 3617 /* Do not wake up a writer until he can make "significant" 3618 * progress. 
--DaveM 3619 */ 3620 if (sock_writeable(sk)) { 3621 wq = rcu_dereference(sk->sk_wq); 3622 if (skwq_has_sleeper(wq)) 3623 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3624 EPOLLWRNORM | EPOLLWRBAND); 3625 3626 /* Should agree with poll, otherwise some programs break */ 3627 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3628 } 3629 3630 rcu_read_unlock(); 3631 } 3632 3633 /* An optimised version of sock_def_write_space(), should only be called 3634 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3635 * ->sk_wmem_alloc. 3636 */ 3637 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) 3638 { 3639 /* Do not wake up a writer until he can make "significant" 3640 * progress. --DaveM 3641 */ 3642 if (__sock_writeable(sk, wmem_alloc)) { 3643 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3644 3645 /* rely on refcount_sub from sock_wfree() */ 3646 smp_mb__after_atomic(); 3647 if (wq && waitqueue_active(&wq->wait)) 3648 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3649 EPOLLWRNORM | EPOLLWRBAND); 3650 3651 /* Should agree with poll, otherwise some programs break */ 3652 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3653 } 3654 } 3655 3656 static void sock_def_destruct(struct sock *sk) 3657 { 3658 } 3659 3660 void sk_send_sigurg(struct sock *sk) 3661 { 3662 if (sk->sk_socket && sk->sk_socket->file) 3663 if (send_sigurg(sk->sk_socket->file)) 3664 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3665 } 3666 EXPORT_SYMBOL(sk_send_sigurg); 3667 3668 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3669 unsigned long expires) 3670 { 3671 if (!mod_timer(timer, expires)) 3672 sock_hold(sk); 3673 } 3674 EXPORT_SYMBOL(sk_reset_timer); 3675 3676 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3677 { 3678 if (timer_delete(timer)) 3679 __sock_put(sk); 3680 } 3681 EXPORT_SYMBOL(sk_stop_timer); 3682 3683 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3684 { 3685 if 
(timer_delete_sync(timer)) 3686 __sock_put(sk); 3687 } 3688 EXPORT_SYMBOL(sk_stop_timer_sync); 3689 3690 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3691 { 3692 sk_init_common(sk); 3693 sk->sk_send_head = NULL; 3694 3695 timer_setup(&sk->sk_timer, NULL, 0); 3696 3697 sk->sk_allocation = GFP_KERNEL; 3698 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3699 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3700 sk->sk_state = TCP_CLOSE; 3701 sk->sk_use_task_frag = true; 3702 sk_set_socket(sk, sock); 3703 3704 sock_set_flag(sk, SOCK_ZAPPED); 3705 3706 if (sock) { 3707 sk->sk_type = sock->type; 3708 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3709 sock->sk = sk; 3710 } else { 3711 RCU_INIT_POINTER(sk->sk_wq, NULL); 3712 } 3713 sk->sk_uid = uid; 3714 3715 sk->sk_state_change = sock_def_wakeup; 3716 sk->sk_data_ready = sock_def_readable; 3717 sk->sk_write_space = sock_def_write_space; 3718 sk->sk_error_report = sock_def_error_report; 3719 sk->sk_destruct = sock_def_destruct; 3720 3721 sk->sk_frag.page = NULL; 3722 sk->sk_frag.offset = 0; 3723 sk->sk_peek_off = -1; 3724 3725 sk->sk_peer_pid = NULL; 3726 sk->sk_peer_cred = NULL; 3727 spin_lock_init(&sk->sk_peer_lock); 3728 3729 sk->sk_write_pending = 0; 3730 sk->sk_rcvlowat = 1; 3731 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3732 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3733 3734 sk->sk_stamp = SK_DEFAULT_STAMP; 3735 #if BITS_PER_LONG==32 3736 seqlock_init(&sk->sk_stamp_seq); 3737 #endif 3738 atomic_set(&sk->sk_zckey, 0); 3739 3740 #ifdef CONFIG_NET_RX_BUSY_POLL 3741 sk->sk_napi_id = 0; 3742 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3743 #endif 3744 3745 sk->sk_max_pacing_rate = ~0UL; 3746 sk->sk_pacing_rate = ~0UL; 3747 WRITE_ONCE(sk->sk_pacing_shift, 10); 3748 sk->sk_incoming_cpu = -1; 3749 3750 sk_rx_queue_clear(sk); 3751 /* 3752 * Before updating sk_refcnt, we must commit prior changes to memory 3753 * (Documentation/RCU/rculist_nulls.rst for details) 3754 */ 3755 smp_wmb(); 3756 
refcount_set(&sk->sk_refcnt, 1); 3757 sk_drops_reset(sk); 3758 } 3759 EXPORT_SYMBOL(sock_init_data_uid); 3760 3761 void sock_init_data(struct socket *sock, struct sock *sk) 3762 { 3763 kuid_t uid = sock ? 3764 SOCK_INODE(sock)->i_uid : 3765 make_kuid(sock_net(sk)->user_ns, 0); 3766 3767 sock_init_data_uid(sock, sk, uid); 3768 } 3769 EXPORT_SYMBOL(sock_init_data); 3770 3771 void noinline lock_sock_nested(struct sock *sk, int subclass) 3772 { 3773 /* The sk_lock has mutex_lock() semantics here. */ 3774 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3775 3776 might_sleep(); 3777 #ifdef CONFIG_64BIT 3778 if (sizeof(struct slock_owned) == sizeof(long)) { 3779 socket_lock_t tmp = { 3780 .slock = __SPIN_LOCK_UNLOCKED(tmp.slock), 3781 .owned = 1, 3782 }; 3783 socket_lock_t old = { 3784 .slock = __SPIN_LOCK_UNLOCKED(old.slock), 3785 .owned = 0, 3786 }; 3787 3788 if (likely(try_cmpxchg(&sk->sk_lock.combined, 3789 &old.combined, tmp.combined))) 3790 return; 3791 } 3792 #endif 3793 spin_lock_bh(&sk->sk_lock.slock); 3794 if (unlikely(sock_owned_by_user_nocheck(sk))) 3795 __lock_sock(sk); 3796 sk->sk_lock.owned = 1; 3797 spin_unlock_bh(&sk->sk_lock.slock); 3798 } 3799 EXPORT_SYMBOL(lock_sock_nested); 3800 3801 void release_sock(struct sock *sk) 3802 { 3803 spin_lock_bh(&sk->sk_lock.slock); 3804 3805 if (unlikely(sk->sk_backlog.tail)) 3806 __release_sock(sk); 3807 3808 if (sk->sk_prot->release_cb) { 3809 if (!tcp_release_cb_cond(sk)) 3810 sk->sk_prot->release_cb(sk); 3811 } 3812 sock_release_ownership(sk); 3813 if (unlikely(waitqueue_active(&sk->sk_lock.wq))) 3814 wake_up(&sk->sk_lock.wq); 3815 3816 spin_unlock_bh(&sk->sk_lock.slock); 3817 } 3818 EXPORT_SYMBOL(release_sock); 3819 3820 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3821 { 3822 might_sleep(); 3823 spin_lock_bh(&sk->sk_lock.slock); 3824 3825 if (likely(!sock_owned_by_user_nocheck(sk))) { 3826 /* 3827 * Fast path return with bottom halves disabled and 3828 * sock::sk_lock.slock 
held. 3829 * 3830 * The 'mutex' is not contended and holding 3831 * sock::sk_lock.slock prevents all other lockers to 3832 * proceed so the corresponding unlock_sock_fast() can 3833 * avoid the slow path of release_sock() completely and 3834 * just release slock. 3835 * 3836 * From a semantical POV this is equivalent to 'acquiring' 3837 * the 'mutex', hence the corresponding lockdep 3838 * mutex_release() has to happen in the fast path of 3839 * unlock_sock_fast(). 3840 */ 3841 return false; 3842 } 3843 3844 __lock_sock(sk); 3845 sk->sk_lock.owned = 1; 3846 __acquire(&sk->sk_lock.slock); 3847 spin_unlock_bh(&sk->sk_lock.slock); 3848 return true; 3849 } 3850 EXPORT_SYMBOL(__lock_sock_fast); 3851 3852 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3853 bool timeval, bool time32) 3854 { 3855 struct sock *sk = sock->sk; 3856 struct timespec64 ts; 3857 3858 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3859 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3860 if (ts.tv_sec == -1) 3861 return -ENOENT; 3862 if (ts.tv_sec == 0) { 3863 ktime_t kt = ktime_get_real(); 3864 sock_write_timestamp(sk, kt); 3865 ts = ktime_to_timespec64(kt); 3866 } 3867 3868 if (timeval) 3869 ts.tv_nsec /= 1000; 3870 3871 #ifdef CONFIG_COMPAT_32BIT_TIME 3872 if (time32) 3873 return put_old_timespec32(&ts, userstamp); 3874 #endif 3875 #ifdef CONFIG_SPARC64 3876 /* beware of padding in sparc64 timeval */ 3877 if (timeval && !in_compat_syscall()) { 3878 struct __kernel_old_timeval __user tv = { 3879 .tv_sec = ts.tv_sec, 3880 .tv_usec = ts.tv_nsec, 3881 }; 3882 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3883 return -EFAULT; 3884 return 0; 3885 } 3886 #endif 3887 return put_timespec64(&ts, userstamp); 3888 } 3889 EXPORT_SYMBOL(sock_gettstamp); 3890 3891 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3892 { 3893 if (!sock_flag(sk, flag)) { 3894 unsigned long previous_flags = sk->sk_flags; 3895 3896 sock_set_flag(sk, flag); 3897 /* 3898 * we just set one of the two 
flags which require net 3899 * time stamping, but time stamping might have been on 3900 * already because of the other one 3901 */ 3902 if (sock_needs_netstamp(sk) && 3903 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3904 net_enable_timestamp(); 3905 } 3906 } 3907 3908 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3909 int level, int type) 3910 { 3911 struct sock_extended_err ee; 3912 struct sk_buff *skb; 3913 int copied, err; 3914 3915 err = -EAGAIN; 3916 skb = sock_dequeue_err_skb(sk); 3917 if (skb == NULL) 3918 goto out; 3919 3920 copied = skb->len; 3921 if (copied > len) { 3922 msg->msg_flags |= MSG_TRUNC; 3923 copied = len; 3924 } 3925 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3926 if (err) 3927 goto out_free_skb; 3928 3929 sock_recv_timestamp(msg, sk, skb); 3930 3931 /* We must use a bounce buffer for CONFIG_HARDENED_USERCOPY=y */ 3932 ee = SKB_EXT_ERR(skb)->ee; 3933 put_cmsg(msg, level, type, sizeof(ee), &ee); 3934 3935 msg->msg_flags |= MSG_ERRQUEUE; 3936 err = copied; 3937 3938 out_free_skb: 3939 kfree_skb(skb); 3940 out: 3941 return err; 3942 } 3943 EXPORT_SYMBOL(sock_recv_errqueue); 3944 3945 /* 3946 * Get a socket option on an socket. 3947 * 3948 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3949 * asynchronous errors should be reported by getsockopt. We assume 3950 * this means if you specify SO_ERROR (otherwise what is the point of it). 3951 */ 3952 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3953 char __user *optval, int __user *optlen) 3954 { 3955 struct sock *sk = sock->sk; 3956 3957 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ 3958 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3959 } 3960 EXPORT_SYMBOL(sock_common_getsockopt); 3961 3962 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3963 int flags) 3964 { 3965 struct sock *sk = sock->sk; 3966 3967 return sk->sk_prot->recvmsg(sk, msg, size, flags); 3968 } 3969 EXPORT_SYMBOL(sock_common_recvmsg); 3970 3971 /* 3972 * Set socket options on an inet socket. 3973 */ 3974 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3975 sockptr_t optval, unsigned int optlen) 3976 { 3977 struct sock *sk = sock->sk; 3978 3979 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3980 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3981 } 3982 EXPORT_SYMBOL(sock_common_setsockopt); 3983 3984 void sk_common_release(struct sock *sk) 3985 { 3986 if (sk->sk_prot->destroy) 3987 sk->sk_prot->destroy(sk); 3988 3989 /* 3990 * Observation: when sk_common_release is called, processes have 3991 * no access to socket. But net still has. 3992 * Step one, detach it from networking: 3993 * 3994 * A. Remove from hash tables. 3995 */ 3996 3997 sk->sk_prot->unhash(sk); 3998 3999 /* 4000 * In this point socket cannot receive new packets, but it is possible 4001 * that some packets are in flight because some CPU runs receiver and 4002 * did hash table lookup before we unhashed socket. They will achieve 4003 * receive queue and will be purged by socket destructor. 4004 * 4005 * Also we still have packets pending on receive queue and probably, 4006 * our own packets waiting in device queues. sock_destroy will drain 4007 * receive queue, but transmitted packets will delay socket destruction 4008 * until the last reference will be released. 
4009 */ 4010 4011 sock_orphan(sk); 4012 4013 xfrm_sk_free_policy(sk); 4014 4015 sock_put(sk); 4016 } 4017 EXPORT_SYMBOL(sk_common_release); 4018 4019 void sk_get_meminfo(const struct sock *sk, u32 *mem) 4020 { 4021 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 4022 4023 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 4024 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 4025 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 4026 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 4027 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 4028 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 4029 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 4030 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 4031 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); 4032 } 4033 4034 #ifdef CONFIG_PROC_FS 4035 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 4036 4037 int sock_prot_inuse_get(struct net *net, struct proto *prot) 4038 { 4039 int cpu, idx = prot->inuse_idx; 4040 int res = 0; 4041 4042 for_each_possible_cpu(cpu) 4043 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 4044 4045 return res >= 0 ? 
res : 0; 4046 } 4047 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4048 4049 int sock_inuse_get(struct net *net) 4050 { 4051 int cpu, res = 0; 4052 4053 for_each_possible_cpu(cpu) 4054 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4055 4056 return res; 4057 } 4058 4059 EXPORT_SYMBOL_GPL(sock_inuse_get); 4060 4061 static int __net_init sock_inuse_init_net(struct net *net) 4062 { 4063 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4064 if (net->core.prot_inuse == NULL) 4065 return -ENOMEM; 4066 return 0; 4067 } 4068 4069 static void __net_exit sock_inuse_exit_net(struct net *net) 4070 { 4071 free_percpu(net->core.prot_inuse); 4072 } 4073 4074 static struct pernet_operations net_inuse_ops = { 4075 .init = sock_inuse_init_net, 4076 .exit = sock_inuse_exit_net, 4077 }; 4078 4079 static __init int net_inuse_init(void) 4080 { 4081 if (register_pernet_subsys(&net_inuse_ops)) 4082 panic("Cannot initialize net inuse counters"); 4083 4084 return 0; 4085 } 4086 4087 core_initcall(net_inuse_init); 4088 4089 static int assign_proto_idx(struct proto *prot) 4090 { 4091 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4092 4093 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4094 pr_err("PROTO_INUSE_NR exhausted\n"); 4095 return -ENOSPC; 4096 } 4097 4098 set_bit(prot->inuse_idx, proto_inuse_idx); 4099 return 0; 4100 } 4101 4102 static void release_proto_idx(struct proto *prot) 4103 { 4104 if (prot->inuse_idx != PROTO_INUSE_NR) 4105 clear_bit(prot->inuse_idx, proto_inuse_idx); 4106 } 4107 #else 4108 static inline int assign_proto_idx(struct proto *prot) 4109 { 4110 return 0; 4111 } 4112 4113 static inline void release_proto_idx(struct proto *prot) 4114 { 4115 } 4116 4117 #endif 4118 4119 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4120 { 4121 if (!twsk_prot) 4122 return; 4123 kfree(twsk_prot->twsk_slab_name); 4124 twsk_prot->twsk_slab_name = NULL; 4125 kmem_cache_destroy(twsk_prot->twsk_slab); 4126 twsk_prot->twsk_slab = NULL; 
4127 } 4128 4129 static int tw_prot_init(const struct proto *prot) 4130 { 4131 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4132 4133 if (!twsk_prot) 4134 return 0; 4135 4136 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4137 prot->name); 4138 if (!twsk_prot->twsk_slab_name) 4139 return -ENOMEM; 4140 4141 twsk_prot->twsk_slab = 4142 kmem_cache_create(twsk_prot->twsk_slab_name, 4143 twsk_prot->twsk_obj_size, 0, 4144 SLAB_ACCOUNT | prot->slab_flags, 4145 NULL); 4146 if (!twsk_prot->twsk_slab) { 4147 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4148 prot->name); 4149 return -ENOMEM; 4150 } 4151 4152 return 0; 4153 } 4154 4155 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4156 { 4157 if (!rsk_prot) 4158 return; 4159 kfree(rsk_prot->slab_name); 4160 rsk_prot->slab_name = NULL; 4161 kmem_cache_destroy(rsk_prot->slab); 4162 rsk_prot->slab = NULL; 4163 } 4164 4165 static int req_prot_init(const struct proto *prot) 4166 { 4167 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4168 4169 if (!rsk_prot) 4170 return 0; 4171 4172 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4173 prot->name); 4174 if (!rsk_prot->slab_name) 4175 return -ENOMEM; 4176 4177 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4178 rsk_prot->obj_size, 0, 4179 SLAB_ACCOUNT | prot->slab_flags, 4180 NULL); 4181 4182 if (!rsk_prot->slab) { 4183 pr_crit("%s: Can't create request sock SLAB cache!\n", 4184 prot->name); 4185 return -ENOMEM; 4186 } 4187 return 0; 4188 } 4189 4190 int proto_register(struct proto *prot, int alloc_slab) 4191 { 4192 int ret = -ENOBUFS; 4193 4194 if (prot->memory_allocated && !prot->sysctl_mem) { 4195 pr_err("%s: missing sysctl_mem\n", prot->name); 4196 return -EINVAL; 4197 } 4198 if (prot->memory_allocated && !prot->per_cpu_fw_alloc) { 4199 pr_err("%s: missing per_cpu_fw_alloc\n", prot->name); 4200 return -EINVAL; 4201 } 4202 if (alloc_slab) { 4203 struct kmem_cache_args args = { 4204 .useroffset = 
prot->useroffset, 4205 .usersize = prot->usersize, 4206 .freeptr_offset = prot->freeptr_offset, 4207 .use_freeptr_offset = !!prot->freeptr_offset, 4208 }; 4209 4210 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 4211 &args, 4212 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 4213 prot->slab_flags); 4214 if (prot->slab == NULL) { 4215 pr_crit("%s: Can't create sock SLAB cache!\n", 4216 prot->name); 4217 goto out; 4218 } 4219 4220 if (req_prot_init(prot)) 4221 goto out_free_request_sock_slab; 4222 4223 if (tw_prot_init(prot)) 4224 goto out_free_timewait_sock_slab; 4225 } 4226 4227 mutex_lock(&proto_list_mutex); 4228 ret = assign_proto_idx(prot); 4229 if (ret) { 4230 mutex_unlock(&proto_list_mutex); 4231 goto out_free_timewait_sock_slab; 4232 } 4233 list_add(&prot->node, &proto_list); 4234 mutex_unlock(&proto_list_mutex); 4235 return ret; 4236 4237 out_free_timewait_sock_slab: 4238 if (alloc_slab) 4239 tw_prot_cleanup(prot->twsk_prot); 4240 out_free_request_sock_slab: 4241 if (alloc_slab) { 4242 req_prot_cleanup(prot->rsk_prot); 4243 4244 kmem_cache_destroy(prot->slab); 4245 prot->slab = NULL; 4246 } 4247 out: 4248 return ret; 4249 } 4250 EXPORT_SYMBOL(proto_register); 4251 4252 void proto_unregister(struct proto *prot) 4253 { 4254 mutex_lock(&proto_list_mutex); 4255 release_proto_idx(prot); 4256 list_del(&prot->node); 4257 mutex_unlock(&proto_list_mutex); 4258 4259 kmem_cache_destroy(prot->slab); 4260 prot->slab = NULL; 4261 4262 req_prot_cleanup(prot->rsk_prot); 4263 tw_prot_cleanup(prot->twsk_prot); 4264 } 4265 EXPORT_SYMBOL(proto_unregister); 4266 4267 int sock_load_diag_module(int family, int protocol) 4268 { 4269 if (!protocol) { 4270 if (!sock_is_registered(family)) 4271 return -ENOENT; 4272 4273 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 4274 NETLINK_SOCK_DIAG, family); 4275 } 4276 4277 #ifdef CONFIG_INET 4278 if (family == AF_INET && 4279 protocol != IPPROTO_RAW && 4280 protocol < MAX_INET_PROTOS && 4281 
!rcu_access_pointer(inet_protos[protocol])) 4282 return -ENOENT; 4283 #endif 4284 4285 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 4286 NETLINK_SOCK_DIAG, family, protocol); 4287 } 4288 EXPORT_SYMBOL(sock_load_diag_module); 4289 4290 #ifdef CONFIG_PROC_FS 4291 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 4292 __acquires(proto_list_mutex) 4293 { 4294 mutex_lock(&proto_list_mutex); 4295 return seq_list_start_head(&proto_list, *pos); 4296 } 4297 4298 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 4299 { 4300 return seq_list_next(v, &proto_list, pos); 4301 } 4302 4303 static void proto_seq_stop(struct seq_file *seq, void *v) 4304 __releases(proto_list_mutex) 4305 { 4306 mutex_unlock(&proto_list_mutex); 4307 } 4308 4309 static char proto_method_implemented(const void *method) 4310 { 4311 return method == NULL ? 'n' : 'y'; 4312 } 4313 static long sock_prot_memory_allocated(struct proto *proto) 4314 { 4315 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 4316 } 4317 4318 static const char *sock_prot_memory_pressure(struct proto *proto) 4319 { 4320 return proto->memory_pressure != NULL ? 4321 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 4322 } 4323 4324 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4325 { 4326 4327 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4328 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4329 proto->name, 4330 proto->obj_size, 4331 sock_prot_inuse_get(seq_file_net(seq), proto), 4332 sock_prot_memory_allocated(proto), 4333 sock_prot_memory_pressure(proto), 4334 proto->max_header, 4335 proto->slab == NULL ? 
"no" : "yes", 4336 module_name(proto->owner), 4337 proto_method_implemented(proto->close), 4338 proto_method_implemented(proto->connect), 4339 proto_method_implemented(proto->disconnect), 4340 proto_method_implemented(proto->accept), 4341 proto_method_implemented(proto->ioctl), 4342 proto_method_implemented(proto->init), 4343 proto_method_implemented(proto->destroy), 4344 proto_method_implemented(proto->shutdown), 4345 proto_method_implemented(proto->setsockopt), 4346 proto_method_implemented(proto->getsockopt), 4347 proto_method_implemented(proto->sendmsg), 4348 proto_method_implemented(proto->recvmsg), 4349 proto_method_implemented(proto->bind), 4350 proto_method_implemented(proto->backlog_rcv), 4351 proto_method_implemented(proto->hash), 4352 proto_method_implemented(proto->unhash), 4353 proto_method_implemented(proto->get_port), 4354 proto_method_implemented(proto->enter_memory_pressure)); 4355 } 4356 4357 static int proto_seq_show(struct seq_file *seq, void *v) 4358 { 4359 if (v == &proto_list) 4360 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4361 "protocol", 4362 "size", 4363 "sockets", 4364 "memory", 4365 "press", 4366 "maxhdr", 4367 "slab", 4368 "module", 4369 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4370 else 4371 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4372 return 0; 4373 } 4374 4375 static const struct seq_operations proto_seq_ops = { 4376 .start = proto_seq_start, 4377 .next = proto_seq_next, 4378 .stop = proto_seq_stop, 4379 .show = proto_seq_show, 4380 }; 4381 4382 static __net_init int proto_init_net(struct net *net) 4383 { 4384 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4385 sizeof(struct seq_net_private))) 4386 return -ENOMEM; 4387 4388 return 0; 4389 } 4390 4391 static __net_exit void proto_exit_net(struct net *net) 4392 { 4393 remove_proc_entry("protocols", net->proc_net); 4394 } 4395 4396 4397 static __net_initdata struct pernet_operations proto_net_ops = { 4398 
.init = proto_init_net, 4399 .exit = proto_exit_net, 4400 }; 4401 4402 static int __init proto_init(void) 4403 { 4404 return register_pernet_subsys(&proto_net_ops); 4405 } 4406 4407 subsys_initcall(proto_init); 4408 4409 #endif /* PROC_FS */ 4410 4411 #ifdef CONFIG_NET_RX_BUSY_POLL 4412 bool sk_busy_loop_end(void *p, unsigned long start_time) 4413 { 4414 struct sock *sk = p; 4415 4416 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4417 return true; 4418 4419 if (sk_is_udp(sk) && 4420 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4421 return true; 4422 4423 return sk_busy_loop_timeout(sk, start_time); 4424 } 4425 EXPORT_SYMBOL(sk_busy_loop_end); 4426 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4427 4428 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) 4429 { 4430 if (!sk->sk_prot->bind_add) 4431 return -EOPNOTSUPP; 4432 return sk->sk_prot->bind_add(sk, addr, addr_len); 4433 } 4434 EXPORT_SYMBOL(sock_bind_add); 4435 4436 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4437 int sock_ioctl_inout(struct sock *sk, unsigned int cmd, 4438 void __user *arg, void *karg, size_t size) 4439 { 4440 int ret; 4441 4442 if (copy_from_user(karg, arg, size)) 4443 return -EFAULT; 4444 4445 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg); 4446 if (ret) 4447 return ret; 4448 4449 if (copy_to_user(arg, karg, size)) 4450 return -EFAULT; 4451 4452 return 0; 4453 } 4454 EXPORT_SYMBOL(sock_ioctl_inout); 4455 4456 /* This is the most common ioctl prep function, where the result (4 bytes) is 4457 * copied back to userspace if the ioctl() returns successfully. No input is 4458 * copied from userspace as input argument. 
4459 */ 4460 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg) 4461 { 4462 int ret, karg = 0; 4463 4464 ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg); 4465 if (ret) 4466 return ret; 4467 4468 return put_user(karg, (int __user *)arg); 4469 } 4470 4471 /* A wrapper around sock ioctls, which copies the data from userspace 4472 * (depending on the protocol/ioctl), and copies back the result to userspace. 4473 * The main motivation for this function is to pass kernel memory to the 4474 * protocol ioctl callbacks, instead of userspace memory. 4475 */ 4476 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) 4477 { 4478 int rc = 1; 4479 4480 if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET) 4481 rc = ipmr_sk_ioctl(sk, cmd, arg); 4482 else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6) 4483 rc = ip6mr_sk_ioctl(sk, cmd, arg); 4484 else if (sk_is_phonet(sk)) 4485 rc = phonet_sk_ioctl(sk, cmd, arg); 4486 4487 /* If ioctl was processed, returns its value */ 4488 if (rc <= 0) 4489 return rc; 4490 4491 /* Otherwise call the default handler */ 4492 return sock_ioctl_out(sk, cmd, arg); 4493 } 4494 EXPORT_SYMBOL(sk_ioctl); 4495 4496 static int __init sock_struct_check(void) 4497 { 4498 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops); 4499 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off); 4500 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue); 4501 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue); 4502 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog); 4503 4504 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst); 4505 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex); 4506 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie); 4507 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf); 4508 CACHELINE_ASSERT_GROUP_MEMBER(struct 
sock, sock_read_rx, sk_filter); 4509 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq); 4510 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready); 4511 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo); 4512 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat); 4513 4514 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err); 4515 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket); 4516 #ifdef CONFIG_MEMCG 4517 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg); 4518 #endif 4519 4520 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock); 4521 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem); 4522 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc); 4523 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags); 4524 4525 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4526 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc); 4527 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft); 4528 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued); 4529 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc); 4530 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags); 4531 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head); 4532 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue); 4533 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending); 4534 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag); 4535 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer); 4536 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate); 4537 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey); 4538 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, 
sock_write_tx, sk_tskey); 4539 4540 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm); 4541 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status); 4542 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate); 4543 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo); 4544 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority); 4545 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark); 4546 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid); 4547 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol); 4548 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache); 4549 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps); 4550 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type); 4551 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size); 4552 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation); 4553 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash); 4554 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf); 4555 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs); 4556 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift); 4557 CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag); 4558 return 0; 4559 } 4560 4561 core_initcall(sock_struct_check); 4562