1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <linux/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/udp.h> 111 #include <linux/init.h> 112 #include <linux/highmem.h> 113 #include <linux/user_namespace.h> 114 #include <linux/static_key.h> 115 #include <linux/memcontrol.h> 116 #include <linux/prefetch.h> 117 #include <linux/compat.h> 118 #include <linux/mroute.h> 119 #include <linux/mroute6.h> 120 #include <linux/icmpv6.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <linux/skbuff_ref.h> 128 #include <net/net_namespace.h> 129 #include <net/request_sock.h> 130 #include <net/sock.h> 131 #include <net/proto_memory.h> 132 #include <linux/net_tstamp.h> 133 #include <net/xfrm.h> 134 #include <linux/ipsec.h> 135 #include <net/cls_cgroup.h> 136 #include <net/netprio_cgroup.h> 137 #include <linux/sock_diag.h> 138 139 #include <linux/filter.h> 140 #include <net/sock_reuseport.h> 141 #include <net/bpf_sk_storage.h> 142 143 #include <trace/events/sock.h> 144 145 #include <net/tcp.h> 146 #include <net/busy_poll.h> 147 #include <net/phonet/phonet.h> 148 149 #include <linux/ethtool.h> 150 151 #include <uapi/linux/pidfd.h> 152 153 #include "dev.h" 154 155 static DEFINE_MUTEX(proto_list_mutex); 156 static LIST_HEAD(proto_list); 157 158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); 159 static void sock_def_write_space(struct sock *sk); 160 161 /** 162 * sk_ns_capable - General socket capability test 163 * @sk: Socket to use a capability on or through 164 * @user_ns: The user namespace of the capability to use 165 * @cap: The capability to use 166 * 167 * Test to see if the opener of the socket had when the socket was 168 * created and the current process has the capability @cap in the user 169 * namespace @user_ns. 170 */ 171 bool sk_ns_capable(const struct sock *sk, 172 struct user_namespace *user_ns, int cap) 173 { 174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 175 ns_capable(user_ns, cap); 176 } 177 EXPORT_SYMBOL(sk_ns_capable); 178 179 /** 180 * sk_capable - Socket global capability test 181 * @sk: Socket to use a capability on or through 182 * @cap: The global capability to use 183 * 184 * Test to see if the opener of the socket had when the socket was 185 * created and the current process has the capability @cap in all user 186 * namespaces. 
187 */ 188 bool sk_capable(const struct sock *sk, int cap) 189 { 190 return sk_ns_capable(sk, &init_user_ns, cap); 191 } 192 EXPORT_SYMBOL(sk_capable); 193 194 /** 195 * sk_net_capable - Network namespace socket capability test 196 * @sk: Socket to use a capability on or through 197 * @cap: The capability to use 198 * 199 * Test to see if the opener of the socket had when the socket was created 200 * and the current process has the capability @cap over the network namespace 201 * the socket is a member of. 202 */ 203 bool sk_net_capable(const struct sock *sk, int cap) 204 { 205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 206 } 207 EXPORT_SYMBOL(sk_net_capable); 208 209 /* 210 * Each address family might have different locking rules, so we have 211 * one slock key per address family and separate keys for internal and 212 * userspace sockets. 213 */ 214 static struct lock_class_key af_family_keys[AF_MAX]; 215 static struct lock_class_key af_family_kern_keys[AF_MAX]; 216 static struct lock_class_key af_family_slock_keys[AF_MAX]; 217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 218 219 /* 220 * Make lock validator output more readable. (we pre-construct these 221 * strings build-time, so that runtime initialization of socket 222 * locks is fast): 223 */ 224 225 #define _sock_locks(x) \ 226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 235 x "27" , x "28" , x "AF_CAN" , \ 236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 241 x "AF_MCTP" , \ 242 x "AF_MAX" 243 244 static const char *const af_family_key_strings[AF_MAX+1] = { 245 _sock_locks("sk_lock-") 246 }; 247 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 248 _sock_locks("slock-") 249 }; 250 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 251 _sock_locks("clock-") 252 }; 253 254 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 255 _sock_locks("k-sk_lock-") 256 }; 257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 258 _sock_locks("k-slock-") 259 }; 260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 261 _sock_locks("k-clock-") 262 }; 263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 264 _sock_locks("rlock-") 265 }; 266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 267 _sock_locks("wlock-") 268 }; 269 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 270 _sock_locks("elock-") 271 }; 272 273 /* 274 * sk_callback_lock and sk queues locking rules are per-address-family, 275 * so split the lock classes by using a per-AF key: 276 */ 277 static struct lock_class_key af_callback_keys[AF_MAX]; 278 static struct lock_class_key af_rlock_keys[AF_MAX]; 279 static struct lock_class_key af_wlock_keys[AF_MAX]; 280 static struct lock_class_key af_elock_keys[AF_MAX]; 281 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 282 283 /* Run time adjustable parameters. 
*/ 284 __u32 sysctl_wmem_max __read_mostly = 4 << 20; 285 EXPORT_SYMBOL(sysctl_wmem_max); 286 __u32 sysctl_rmem_max __read_mostly = 4 << 20; 287 EXPORT_SYMBOL(sysctl_rmem_max); 288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; 289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; 290 291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 292 EXPORT_SYMBOL_GPL(memalloc_socks_key); 293 294 /** 295 * sk_set_memalloc - sets %SOCK_MEMALLOC 296 * @sk: socket to set it on 297 * 298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 299 * It's the responsibility of the admin to adjust min_free_kbytes 300 * to meet the requirements 301 */ 302 void sk_set_memalloc(struct sock *sk) 303 { 304 sock_set_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation |= __GFP_MEMALLOC; 306 static_branch_inc(&memalloc_socks_key); 307 } 308 EXPORT_SYMBOL_GPL(sk_set_memalloc); 309 310 void sk_clear_memalloc(struct sock *sk) 311 { 312 sock_reset_flag(sk, SOCK_MEMALLOC); 313 sk->sk_allocation &= ~__GFP_MEMALLOC; 314 static_branch_dec(&memalloc_socks_key); 315 316 /* 317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 318 * progress of swapping. SOCK_MEMALLOC may be cleared while 319 * it has rmem allocations due to the last swapfile being deactivated 320 * but there is a risk that the socket is unusable due to exceeding 321 * the rmem limits. Reclaim the reserves and obey rmem limits again. 322 */ 323 sk_mem_reclaim(sk); 324 } 325 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 326 327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 328 { 329 int ret; 330 unsigned int noreclaim_flag; 331 332 /* these should have been dropped before queueing */ 333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 334 335 noreclaim_flag = memalloc_noreclaim_save(); 336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 337 tcp_v6_do_rcv, 338 tcp_v4_do_rcv, 339 sk, skb); 340 memalloc_noreclaim_restore(noreclaim_flag); 341 342 return ret; 343 } 344 EXPORT_SYMBOL(__sk_backlog_rcv); 345 346 void sk_error_report(struct sock *sk) 347 { 348 sk->sk_error_report(sk); 349 350 switch (sk->sk_family) { 351 case AF_INET: 352 fallthrough; 353 case AF_INET6: 354 trace_inet_sk_error_report(sk); 355 break; 356 default: 357 break; 358 } 359 } 360 EXPORT_SYMBOL(sk_error_report); 361 362 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 363 { 364 struct __kernel_sock_timeval tv; 365 366 if (timeo == MAX_SCHEDULE_TIMEOUT) { 367 tv.tv_sec = 0; 368 tv.tv_usec = 0; 369 } else { 370 tv.tv_sec = timeo / HZ; 371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 372 } 373 374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 376 *(struct old_timeval32 *)optval = tv32; 377 return sizeof(tv32); 378 } 379 380 if (old_timeval) { 381 struct __kernel_old_timeval old_tv; 382 old_tv.tv_sec = tv.tv_sec; 383 old_tv.tv_usec = tv.tv_usec; 384 *(struct __kernel_old_timeval *)optval = old_tv; 385 return sizeof(old_tv); 386 } 387 388 *(struct __kernel_sock_timeval *)optval = tv; 389 return sizeof(tv); 390 } 391 EXPORT_SYMBOL(sock_get_timeout); 392 393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 394 sockptr_t optval, int optlen, bool old_timeval) 395 { 396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 397 struct old_timeval32 tv32; 398 399 if (optlen < sizeof(tv32)) 400 return -EINVAL; 401 402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 403 return -EFAULT; 404 tv->tv_sec = tv32.tv_sec; 405 tv->tv_usec = tv32.tv_usec; 
406 } else if (old_timeval) { 407 struct __kernel_old_timeval old_tv; 408 409 if (optlen < sizeof(old_tv)) 410 return -EINVAL; 411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 412 return -EFAULT; 413 tv->tv_sec = old_tv.tv_sec; 414 tv->tv_usec = old_tv.tv_usec; 415 } else { 416 if (optlen < sizeof(*tv)) 417 return -EINVAL; 418 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 419 return -EFAULT; 420 } 421 422 return 0; 423 } 424 EXPORT_SYMBOL(sock_copy_user_timeval); 425 426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 427 bool old_timeval) 428 { 429 struct __kernel_sock_timeval tv; 430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 431 long val; 432 433 if (err) 434 return err; 435 436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 437 return -EDOM; 438 439 if (tv.tv_sec < 0) { 440 static int warned __read_mostly; 441 442 WRITE_ONCE(*timeo_p, 0); 443 if (warned < 10 && net_ratelimit()) { 444 warned++; 445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 446 __func__, current->comm, task_pid_nr(current)); 447 } 448 return 0; 449 } 450 val = MAX_SCHEDULE_TIMEOUT; 451 if ((tv.tv_sec || tv.tv_usec) && 452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 454 USEC_PER_SEC / HZ); 455 WRITE_ONCE(*timeo_p, val); 456 return 0; 457 } 458 459 static bool sk_set_prio_allowed(const struct sock *sk, int val) 460 { 461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || 462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); 464 } 465 466 static bool sock_needs_netstamp(const struct sock *sk) 467 { 468 switch (sk->sk_family) { 469 case AF_UNSPEC: 470 case AF_UNIX: 471 return false; 472 default: 473 return true; 474 } 475 } 476 477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 478 { 479 if (sk->sk_flags & flags) { 480 sk->sk_flags &= ~flags; 481 if (sock_needs_netstamp(sk) && 482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 483 net_disable_timestamp(); 484 } 485 } 486 487 488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 489 { 490 unsigned long flags; 491 struct sk_buff_head *list = &sk->sk_receive_queue; 492 493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 494 sk_drops_inc(sk); 495 trace_sock_rcvqueue_full(sk, skb); 496 return -ENOMEM; 497 } 498 499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 500 sk_drops_inc(sk); 501 return -ENOBUFS; 502 } 503 504 skb->dev = NULL; 505 skb_set_owner_r(skb, sk); 506 507 /* we escape from rcu protected region, make sure we dont leak 508 * a norefcounted dst 509 */ 510 skb_dst_force(skb); 511 512 spin_lock_irqsave(&list->lock, flags); 513 sock_skb_set_dropcount(sk, skb); 514 __skb_queue_tail(list, skb); 515 spin_unlock_irqrestore(&list->lock, flags); 516 517 if (!sock_flag(sk, SOCK_DEAD)) 518 sk->sk_data_ready(sk); 519 return 0; 520 } 521 EXPORT_SYMBOL(__sock_queue_rcv_skb); 522 523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 524 enum skb_drop_reason *reason) 525 { 526 enum skb_drop_reason drop_reason; 527 int err; 528 529 err = sk_filter_reason(sk, skb, &drop_reason); 530 if (err) 531 goto out; 532 533 err = __sock_queue_rcv_skb(sk, skb); 534 switch (err) { 535 case -ENOMEM: 536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 537 break; 538 case -ENOBUFS: 539 drop_reason = SKB_DROP_REASON_PROTO_MEM; 540 break; 541 default: 542 drop_reason = SKB_NOT_DROPPED_YET; 543 
break; 544 } 545 out: 546 if (reason) 547 *reason = drop_reason; 548 return err; 549 } 550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 551 552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 553 const int nested, unsigned int trim_cap, bool refcounted) 554 { 555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 556 int rc = NET_RX_SUCCESS; 557 int err; 558 559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) 560 goto discard_and_relse; 561 562 skb->dev = NULL; 563 564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 565 sk_drops_inc(sk); 566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 567 goto discard_and_relse; 568 } 569 if (nested) 570 bh_lock_sock_nested(sk); 571 else 572 bh_lock_sock(sk); 573 if (!sock_owned_by_user(sk)) { 574 /* 575 * trylock + unlock semantics: 576 */ 577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 578 579 rc = sk_backlog_rcv(sk, skb); 580 581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { 583 bh_unlock_sock(sk); 584 if (err == -ENOMEM) 585 reason = SKB_DROP_REASON_PFMEMALLOC; 586 if (err == -ENOBUFS) 587 reason = SKB_DROP_REASON_SOCKET_BACKLOG; 588 sk_drops_inc(sk); 589 goto discard_and_relse; 590 } 591 592 bh_unlock_sock(sk); 593 out: 594 if (refcounted) 595 sock_put(sk); 596 return rc; 597 discard_and_relse: 598 sk_skb_reason_drop(sk, skb, reason); 599 goto out; 600 } 601 EXPORT_SYMBOL(__sk_receive_skb); 602 603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 604 u32)); 605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 606 u32)); 607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 608 { 609 struct dst_entry *dst = __sk_dst_get(sk); 610 611 if (dst && READ_ONCE(dst->obsolete) && 612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 613 dst, cookie) == NULL) { 614 sk_tx_queue_clear(sk); 615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 617 dst_release(dst); 618 return NULL; 619 } 620 621 return dst; 622 } 623 EXPORT_SYMBOL(__sk_dst_check); 624 625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 626 { 627 struct dst_entry *dst = sk_dst_get(sk); 628 629 if (dst && READ_ONCE(dst->obsolete) && 630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 631 dst, cookie) == NULL) { 632 sk_dst_reset(sk); 633 dst_release(dst); 634 return NULL; 635 } 636 637 return dst; 638 } 639 EXPORT_SYMBOL(sk_dst_check); 640 641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 642 { 643 int ret = -ENOPROTOOPT; 644 #ifdef CONFIG_NETDEVICES 645 struct net *net = sock_net(sk); 646 647 /* Sorry... */ 648 ret = -EPERM; 649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 650 goto out; 651 652 ret = -EINVAL; 653 if (ifindex < 0) 654 goto out; 655 656 /* Paired with all READ_ONCE() done locklessly. 
*/ 657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 658 659 if (sk->sk_prot->rehash) 660 sk->sk_prot->rehash(sk); 661 sk_dst_reset(sk); 662 663 ret = 0; 664 665 out: 666 #endif 667 668 return ret; 669 } 670 671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 672 { 673 int ret; 674 675 if (lock_sk) 676 lock_sock(sk); 677 ret = sock_bindtoindex_locked(sk, ifindex); 678 if (lock_sk) 679 release_sock(sk); 680 681 return ret; 682 } 683 EXPORT_SYMBOL(sock_bindtoindex); 684 685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 686 { 687 int ret = -ENOPROTOOPT; 688 #ifdef CONFIG_NETDEVICES 689 struct net *net = sock_net(sk); 690 char devname[IFNAMSIZ]; 691 int index; 692 693 ret = -EINVAL; 694 if (optlen < 0) 695 goto out; 696 697 /* Bind this socket to a particular device like "eth0", 698 * as specified in the passed interface name. If the 699 * name is "" or the option length is zero the socket 700 * is not bound. 701 */ 702 if (optlen > IFNAMSIZ - 1) 703 optlen = IFNAMSIZ - 1; 704 memset(devname, 0, sizeof(devname)); 705 706 ret = -EFAULT; 707 if (copy_from_sockptr(devname, optval, optlen)) 708 goto out; 709 710 index = 0; 711 if (devname[0] != '\0') { 712 struct net_device *dev; 713 714 rcu_read_lock(); 715 dev = dev_get_by_name_rcu(net, devname); 716 if (dev) 717 index = dev->ifindex; 718 rcu_read_unlock(); 719 ret = -ENODEV; 720 if (!dev) 721 goto out; 722 } 723 724 sockopt_lock_sock(sk); 725 ret = sock_bindtoindex_locked(sk, index); 726 sockopt_release_sock(sk); 727 out: 728 #endif 729 730 return ret; 731 } 732 733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 734 sockptr_t optlen, int len) 735 { 736 int ret = -ENOPROTOOPT; 737 #ifdef CONFIG_NETDEVICES 738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 739 struct net *net = sock_net(sk); 740 char devname[IFNAMSIZ]; 741 742 if (bound_dev_if == 0) { 743 len = 0; 744 goto zero; 745 } 746 747 ret = -EINVAL; 748 if (len < IFNAMSIZ) 749 goto out; 750 751 ret = netdev_get_name(net, devname, bound_dev_if); 752 if (ret) 753 goto out; 754 755 len = strlen(devname) + 1; 756 757 ret = -EFAULT; 758 if (copy_to_sockptr(optval, devname, len)) 759 goto out; 760 761 zero: 762 ret = -EFAULT; 763 if (copy_to_sockptr(optlen, &len, sizeof(int))) 764 goto out; 765 766 ret = 0; 767 768 out: 769 #endif 770 771 return ret; 772 } 773 774 bool sk_mc_loop(const struct sock *sk) 775 { 776 if (dev_recursion_level()) 777 return false; 778 if (!sk) 779 return true; 780 /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ 781 switch (READ_ONCE(sk->sk_family)) { 782 case AF_INET: 783 return inet_test_bit(MC_LOOP, sk); 784 #if IS_ENABLED(CONFIG_IPV6) 785 case AF_INET6: 786 return inet6_test_bit(MC6_LOOP, sk); 787 #endif 788 } 789 WARN_ON_ONCE(1); 790 return true; 791 } 792 EXPORT_SYMBOL(sk_mc_loop); 793 794 void sock_set_reuseaddr(struct sock *sk) 795 { 796 lock_sock(sk); 797 sk->sk_reuse = SK_CAN_REUSE; 798 release_sock(sk); 799 } 800 EXPORT_SYMBOL(sock_set_reuseaddr); 801 802 void sock_set_reuseport(struct sock *sk) 803 { 804 lock_sock(sk); 805 sk->sk_reuseport = true; 806 release_sock(sk); 807 } 808 EXPORT_SYMBOL(sock_set_reuseport); 809 810 void sock_no_linger(struct sock *sk) 811 { 812 lock_sock(sk); 813 WRITE_ONCE(sk->sk_lingertime, 0); 814 sock_set_flag(sk, SOCK_LINGER); 815 release_sock(sk); 816 } 817 EXPORT_SYMBOL(sock_no_linger); 818 819 void sock_set_priority(struct sock *sk, u32 priority) 820 { 821 WRITE_ONCE(sk->sk_priority, priority); 822 } 823 EXPORT_SYMBOL(sock_set_priority); 824 825 void sock_set_sndtimeo(struct sock *sk, s64 secs) 826 { 827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 829 else 830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 831 } 832 EXPORT_SYMBOL(sock_set_sndtimeo); 833 834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 835 { 836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); 837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); 838 if (val) { 839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 840 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 841 } 842 } 843 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 845 { 846 switch (optname) { 847 case SO_TIMESTAMP_OLD: 848 __sock_set_timestamps(sk, valbool, false, false); 849 break; 850 case SO_TIMESTAMP_NEW: 851 __sock_set_timestamps(sk, valbool, true, false); 852 break; 853 case SO_TIMESTAMPNS_OLD: 854 __sock_set_timestamps(sk, valbool, false, true); 855 break; 856 case SO_TIMESTAMPNS_NEW: 857 __sock_set_timestamps(sk, valbool, true, true); 858 break; 859 } 860 } 861 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 863 { 864 struct net *net = sock_net(sk); 865 struct net_device *dev = NULL; 866 bool match = false; 867 int *vclock_index; 868 int i, num; 869 870 if (sk->sk_bound_dev_if) 871 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 872 873 if (!dev) { 874 pr_err("%s: sock not bind to device\n", __func__); 875 return -EOPNOTSUPP; 876 } 877 878 num = ethtool_get_phc_vclocks(dev, &vclock_index); 879 dev_put(dev); 880 881 for (i = 0; i < num; i++) { 882 if (*(vclock_index + i) == phc_index) { 883 match = true; 884 break; 885 } 886 } 887 888 if (num > 0) 889 kfree(vclock_index); 890 891 if (!match) 892 return -EINVAL; 893 894 WRITE_ONCE(sk->sk_bind_phc, phc_index); 895 896 return 0; 897 } 898 899 int sock_set_timestamping(struct sock *sk, int optname, 900 struct so_timestamping timestamping) 901 { 902 int val = timestamping.flags; 903 int ret; 904 905 if (val & ~SOF_TIMESTAMPING_MASK) 906 return -EINVAL; 907 908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 909 !(val & SOF_TIMESTAMPING_OPT_ID)) 910 return -EINVAL; 911 912 if (val & SOF_TIMESTAMPING_OPT_ID && 913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 914 if (sk_is_tcp(sk)) { 915 if ((1 << sk->sk_state) & 916 (TCPF_CLOSE | TCPF_LISTEN)) 917 return -EINVAL; 918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 920 else 921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 922 } else { 923 
atomic_set(&sk->sk_tskey, 0); 924 } 925 } 926 927 if (val & SOF_TIMESTAMPING_OPT_STATS && 928 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 929 return -EINVAL; 930 931 if (val & SOF_TIMESTAMPING_BIND_PHC) { 932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 933 if (ret) 934 return ret; 935 } 936 937 WRITE_ONCE(sk->sk_tsflags, val); 938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); 940 941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 942 sock_enable_timestamp(sk, 943 SOCK_TIMESTAMPING_RX_SOFTWARE); 944 else 945 sock_disable_timestamp(sk, 946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 947 return 0; 948 } 949 950 #if defined(CONFIG_CGROUP_BPF) 951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) 952 { 953 struct bpf_sock_ops_kern sock_ops; 954 955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 956 sock_ops.op = op; 957 sock_ops.is_fullsock = 1; 958 sock_ops.sk = sk; 959 bpf_skops_init_skb(&sock_ops, skb, 0); 960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); 961 } 962 #endif 963 964 void sock_set_keepalive(struct sock *sk) 965 { 966 lock_sock(sk); 967 if (sk->sk_prot->keepalive) 968 sk->sk_prot->keepalive(sk, true); 969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 970 release_sock(sk); 971 } 972 EXPORT_SYMBOL(sock_set_keepalive); 973 974 static void __sock_set_rcvbuf(struct sock *sk, int val) 975 { 976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 977 * as a negative value. 978 */ 979 val = min_t(int, val, INT_MAX / 2); 980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 981 982 /* We double it on the way in to account for "struct sk_buff" etc. 983 * overhead. Applications assume that the SO_RCVBUF setting they make 984 * will allow that much actual data to be received on that socket. 985 * 986 * Applications are unaware that "struct sk_buff" and other overheads 987 * allocate from the receive buffer during socket buffer allocation. 988 * 989 * And after considering the possible alternatives, returning the value 990 * we actually used in getsockopt is the most desirable behavior. 
991 */ 992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 993 } 994 995 void sock_set_rcvbuf(struct sock *sk, int val) 996 { 997 lock_sock(sk); 998 __sock_set_rcvbuf(sk, val); 999 release_sock(sk); 1000 } 1001 EXPORT_SYMBOL(sock_set_rcvbuf); 1002 1003 static void __sock_set_mark(struct sock *sk, u32 val) 1004 { 1005 if (val != sk->sk_mark) { 1006 WRITE_ONCE(sk->sk_mark, val); 1007 sk_dst_reset(sk); 1008 } 1009 } 1010 1011 void sock_set_mark(struct sock *sk, u32 val) 1012 { 1013 lock_sock(sk); 1014 __sock_set_mark(sk, val); 1015 release_sock(sk); 1016 } 1017 EXPORT_SYMBOL(sock_set_mark); 1018 1019 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1020 { 1021 /* Round down bytes to multiple of pages */ 1022 bytes = round_down(bytes, PAGE_SIZE); 1023 1024 WARN_ON(bytes > sk->sk_reserved_mem); 1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1026 sk_mem_reclaim(sk); 1027 } 1028 1029 static int sock_reserve_memory(struct sock *sk, int bytes) 1030 { 1031 long allocated; 1032 bool charged; 1033 int pages; 1034 1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) 1036 return -EOPNOTSUPP; 1037 1038 if (!bytes) 1039 return 0; 1040 1041 pages = sk_mem_pages(bytes); 1042 1043 /* pre-charge to memcg */ 1044 charged = mem_cgroup_sk_charge(sk, pages, 1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1046 if (!charged) 1047 return -ENOMEM; 1048 1049 if (sk->sk_bypass_prot_mem) 1050 goto success; 1051 1052 /* pre-charge to forward_alloc */ 1053 sk_memory_allocated_add(sk, pages); 1054 allocated = sk_memory_allocated(sk); 1055 1056 /* If the system goes into memory pressure with this 1057 * precharge, give up and return error. 1058 */ 1059 if (allocated > sk_prot_mem_limits(sk, 1)) { 1060 sk_memory_allocated_sub(sk, pages); 1061 mem_cgroup_sk_uncharge(sk, pages); 1062 return -ENOMEM; 1063 } 1064 1065 success: 1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1067 1068 WRITE_ONCE(sk->sk_reserved_mem, 1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1070 1071 return 0; 1072 } 1073 1074 #ifdef CONFIG_PAGE_POOL 1075 1076 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED 1077 * in 1 syscall. The limit exists to limit the amount of memory the kernel 1078 * allocates to copy these tokens, and to prevent looping over the frags for 1079 * too long. 
1080 */ 1081 #define MAX_DONTNEED_TOKENS 128 1082 #define MAX_DONTNEED_FRAGS 1024 1083 1084 static noinline_for_stack int 1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1086 { 1087 unsigned int num_tokens, i, j, k, netmem_num = 0; 1088 struct dmabuf_token *tokens; 1089 int ret = 0, num_frags = 0; 1090 netmem_ref netmems[16]; 1091 1092 if (!sk_is_tcp(sk)) 1093 return -EBADF; 1094 1095 if (optlen % sizeof(*tokens) || 1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1097 return -EINVAL; 1098 1099 num_tokens = optlen / sizeof(*tokens); 1100 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); 1101 if (!tokens) 1102 return -ENOMEM; 1103 1104 if (copy_from_sockptr(tokens, optval, optlen)) { 1105 kvfree(tokens); 1106 return -EFAULT; 1107 } 1108 1109 xa_lock_bh(&sk->sk_user_frags); 1110 for (i = 0; i < num_tokens; i++) { 1111 for (j = 0; j < tokens[i].token_count; j++) { 1112 if (++num_frags > MAX_DONTNEED_FRAGS) 1113 goto frag_limit_reached; 1114 1115 netmem_ref netmem = (__force netmem_ref)__xa_erase( 1116 &sk->sk_user_frags, tokens[i].token_start + j); 1117 1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1119 continue; 1120 1121 netmems[netmem_num++] = netmem; 1122 if (netmem_num == ARRAY_SIZE(netmems)) { 1123 xa_unlock_bh(&sk->sk_user_frags); 1124 for (k = 0; k < netmem_num; k++) 1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1126 netmem_num = 0; 1127 xa_lock_bh(&sk->sk_user_frags); 1128 } 1129 ret++; 1130 } 1131 } 1132 1133 frag_limit_reached: 1134 xa_unlock_bh(&sk->sk_user_frags); 1135 for (k = 0; k < netmem_num; k++) 1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1137 1138 kvfree(tokens); 1139 return ret; 1140 } 1141 #endif 1142 1143 void sockopt_lock_sock(struct sock *sk) 1144 { 1145 /* When current->bpf_ctx is set, the setsockopt is called from 1146 * a bpf prog. bpf has ensured the sk lock has been 1147 * acquired before calling setsockopt(). 1148 */ 1149 if (has_current_bpf_ctx()) 1150 return; 1151 1152 lock_sock(sk); 1153 } 1154 EXPORT_SYMBOL(sockopt_lock_sock); 1155 1156 void sockopt_release_sock(struct sock *sk) 1157 { 1158 if (has_current_bpf_ctx()) 1159 return; 1160 1161 release_sock(sk); 1162 } 1163 EXPORT_SYMBOL(sockopt_release_sock); 1164 1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1166 { 1167 return has_current_bpf_ctx() || ns_capable(ns, cap); 1168 } 1169 EXPORT_SYMBOL(sockopt_ns_capable); 1170 1171 bool sockopt_capable(int cap) 1172 { 1173 return has_current_bpf_ctx() || capable(cap); 1174 } 1175 EXPORT_SYMBOL(sockopt_capable); 1176 1177 static int sockopt_validate_clockid(__kernel_clockid_t value) 1178 { 1179 switch (value) { 1180 case CLOCK_REALTIME: 1181 case CLOCK_MONOTONIC: 1182 case CLOCK_TAI: 1183 return 0; 1184 } 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * This is meant for all protocols to use and covers goings on 1190 * at the socket level. Everything here is generic. 
1191 */ 1192 1193 int sk_setsockopt(struct sock *sk, int level, int optname, 1194 sockptr_t optval, unsigned int optlen) 1195 { 1196 struct so_timestamping timestamping; 1197 struct socket *sock = sk->sk_socket; 1198 struct sock_txtime sk_txtime; 1199 int val; 1200 int valbool; 1201 struct linger ling; 1202 int ret = 0; 1203 1204 /* 1205 * Options without arguments 1206 */ 1207 1208 if (optname == SO_BINDTODEVICE) 1209 return sock_setbindtodevice(sk, optval, optlen); 1210 1211 if (optlen < sizeof(int)) 1212 return -EINVAL; 1213 1214 if (copy_from_sockptr(&val, optval, sizeof(val))) 1215 return -EFAULT; 1216 1217 valbool = val ? 1 : 0; 1218 1219 /* handle options which do not require locking the socket. */ 1220 switch (optname) { 1221 case SO_PRIORITY: 1222 if (sk_set_prio_allowed(sk, val)) { 1223 sock_set_priority(sk, val); 1224 return 0; 1225 } 1226 return -EPERM; 1227 case SO_TYPE: 1228 case SO_PROTOCOL: 1229 case SO_DOMAIN: 1230 case SO_ERROR: 1231 return -ENOPROTOOPT; 1232 #ifdef CONFIG_NET_RX_BUSY_POLL 1233 case SO_BUSY_POLL: 1234 if (val < 0) 1235 return -EINVAL; 1236 WRITE_ONCE(sk->sk_ll_usec, val); 1237 return 0; 1238 case SO_PREFER_BUSY_POLL: 1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1240 return -EPERM; 1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1242 return 0; 1243 case SO_BUSY_POLL_BUDGET: 1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1245 !sockopt_capable(CAP_NET_ADMIN)) 1246 return -EPERM; 1247 if (val < 0 || val > U16_MAX) 1248 return -EINVAL; 1249 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1250 return 0; 1251 #endif 1252 case SO_MAX_PACING_RATE: 1253 { 1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1255 unsigned long pacing_rate; 1256 1257 if (sizeof(ulval) != sizeof(val) && 1258 optlen >= sizeof(ulval) && 1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1260 return -EFAULT; 1261 } 1262 if (ulval != ~0UL) 1263 cmpxchg(&sk->sk_pacing_status, 1264 SK_PACING_NONE, 1265 SK_PACING_NEEDED); 1266 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1269 if (ulval < pacing_rate) 1270 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1271 return 0; 1272 } 1273 case SO_TXREHASH: 1274 if (!sk_is_tcp(sk)) 1275 return -EOPNOTSUPP; 1276 if (val < -1 || val > 1) 1277 return -EINVAL; 1278 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1280 /* Paired with READ_ONCE() in tcp_rtx_synack() 1281 * and sk_getsockopt(). 
1282 */ 1283 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1284 return 0; 1285 case SO_PEEK_OFF: 1286 { 1287 int (*set_peek_off)(struct sock *sk, int val); 1288 1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1290 if (set_peek_off) 1291 ret = set_peek_off(sk, val); 1292 else 1293 ret = -EOPNOTSUPP; 1294 return ret; 1295 } 1296 #ifdef CONFIG_PAGE_POOL 1297 case SO_DEVMEM_DONTNEED: 1298 return sock_devmem_dontneed(sk, optval, optlen); 1299 #endif 1300 case SO_SNDTIMEO_OLD: 1301 case SO_SNDTIMEO_NEW: 1302 return sock_set_timeout(&sk->sk_sndtimeo, optval, 1303 optlen, optname == SO_SNDTIMEO_OLD); 1304 case SO_RCVTIMEO_OLD: 1305 case SO_RCVTIMEO_NEW: 1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval, 1307 optlen, optname == SO_RCVTIMEO_OLD); 1308 } 1309 1310 sockopt_lock_sock(sk); 1311 1312 switch (optname) { 1313 case SO_DEBUG: 1314 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1315 ret = -EACCES; 1316 else 1317 sock_valbool_flag(sk, SOCK_DBG, valbool); 1318 break; 1319 case SO_REUSEADDR: 1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1321 break; 1322 case SO_REUSEPORT: 1323 if (valbool && !sk_is_inet(sk)) 1324 ret = -EOPNOTSUPP; 1325 else 1326 sk->sk_reuseport = valbool; 1327 break; 1328 case SO_DONTROUTE: 1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1330 sk_dst_reset(sk); 1331 break; 1332 case SO_BROADCAST: 1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1334 break; 1335 case SO_SNDBUF: 1336 /* Don't error on this BSD doesn't and if you think 1337 * about it this is right. Otherwise apps have to 1338 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1339 * are treated in BSD as hints 1340 */ 1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1342 set_sndbuf: 1343 /* Ensure val * 2 fits into an int, to prevent max_t() 1344 * from treating it as a negative value. 1345 */ 1346 val = min_t(int, val, INT_MAX / 2); 1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1348 WRITE_ONCE(sk->sk_sndbuf, 1349 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1350 /* Wake up sending tasks if we upped the value. */ 1351 sk->sk_write_space(sk); 1352 break; 1353 1354 case SO_SNDBUFFORCE: 1355 if (!sockopt_capable(CAP_NET_ADMIN)) { 1356 ret = -EPERM; 1357 break; 1358 } 1359 1360 /* No negative values (to prevent underflow, as val will be 1361 * multiplied by 2). 1362 */ 1363 if (val < 0) 1364 val = 0; 1365 goto set_sndbuf; 1366 1367 case SO_RCVBUF: 1368 /* Don't error on this BSD doesn't and if you think 1369 * about it this is right. Otherwise apps have to 1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1371 * are treated in BSD as hints 1372 */ 1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1374 break; 1375 1376 case SO_RCVBUFFORCE: 1377 if (!sockopt_capable(CAP_NET_ADMIN)) { 1378 ret = -EPERM; 1379 break; 1380 } 1381 1382 /* No negative values (to prevent underflow, as val will be 1383 * multiplied by 2). 
1384 */ 1385 __sock_set_rcvbuf(sk, max(val, 0)); 1386 break; 1387 1388 case SO_KEEPALIVE: 1389 if (sk->sk_prot->keepalive) 1390 sk->sk_prot->keepalive(sk, valbool); 1391 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1392 break; 1393 1394 case SO_OOBINLINE: 1395 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1396 break; 1397 1398 case SO_NO_CHECK: 1399 sk->sk_no_check_tx = valbool; 1400 break; 1401 1402 case SO_LINGER: 1403 if (optlen < sizeof(ling)) { 1404 ret = -EINVAL; /* 1003.1g */ 1405 break; 1406 } 1407 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1408 ret = -EFAULT; 1409 break; 1410 } 1411 if (!ling.l_onoff) { 1412 sock_reset_flag(sk, SOCK_LINGER); 1413 } else { 1414 unsigned long t_sec = ling.l_linger; 1415 1416 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1417 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1418 else 1419 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1420 sock_set_flag(sk, SOCK_LINGER); 1421 } 1422 break; 1423 1424 case SO_BSDCOMPAT: 1425 break; 1426 1427 case SO_TIMESTAMP_OLD: 1428 case SO_TIMESTAMP_NEW: 1429 case SO_TIMESTAMPNS_OLD: 1430 case SO_TIMESTAMPNS_NEW: 1431 sock_set_timestamp(sk, optname, valbool); 1432 break; 1433 1434 case SO_TIMESTAMPING_NEW: 1435 case SO_TIMESTAMPING_OLD: 1436 if (optlen == sizeof(timestamping)) { 1437 if (copy_from_sockptr(×tamping, optval, 1438 sizeof(timestamping))) { 1439 ret = -EFAULT; 1440 break; 1441 } 1442 } else { 1443 memset(×tamping, 0, sizeof(timestamping)); 1444 timestamping.flags = val; 1445 } 1446 ret = sock_set_timestamping(sk, optname, timestamping); 1447 break; 1448 1449 case SO_RCVLOWAT: 1450 { 1451 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1452 1453 if (val < 0) 1454 val = INT_MAX; 1455 if (sock) 1456 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1457 if (set_rcvlowat) 1458 ret = set_rcvlowat(sk, val); 1459 else 1460 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1461 break; 1462 } 1463 case SO_ATTACH_FILTER: { 1464 struct sock_fprog fprog; 1465 1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1467 if (!ret) 1468 ret = sk_attach_filter(&fprog, sk); 1469 break; 1470 } 1471 case SO_ATTACH_BPF: 1472 ret = -EINVAL; 1473 if (optlen == sizeof(u32)) { 1474 u32 ufd; 1475 1476 ret = -EFAULT; 1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1478 break; 1479 1480 ret = sk_attach_bpf(ufd, sk); 1481 } 1482 break; 1483 1484 case SO_ATTACH_REUSEPORT_CBPF: { 1485 struct sock_fprog fprog; 1486 1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1488 if (!ret) 1489 ret = sk_reuseport_attach_filter(&fprog, sk); 1490 break; 1491 } 1492 case SO_ATTACH_REUSEPORT_EBPF: 1493 ret = -EINVAL; 1494 if (optlen == sizeof(u32)) { 1495 u32 ufd; 1496 1497 ret = -EFAULT; 1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1499 break; 1500 1501 ret = sk_reuseport_attach_bpf(ufd, sk); 1502 } 1503 break; 1504 1505 case SO_DETACH_REUSEPORT_BPF: 1506 ret = reuseport_detach_prog(sk); 1507 break; 1508 1509 case SO_DETACH_FILTER: 1510 ret = sk_detach_filter(sk); 1511 break; 1512 1513 case SO_LOCK_FILTER: 1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1515 ret = -EPERM; 1516 else 1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1518 break; 1519 1520 case SO_MARK: 1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1523 ret = -EPERM; 1524 break; 1525 } 1526 1527 __sock_set_mark(sk, val); 1528 break; 1529 case SO_RCVMARK: 1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1531 break; 1532 1533 case SO_RCVPRIORITY: 1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1535 break; 1536 1537 case SO_RXQ_OVFL: 1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1539 break; 1540 1541 case SO_WIFI_STATUS: 1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1543 break; 1544 1545 case SO_NOFCS: 1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1547 break; 1548 1549 case SO_SELECT_ERR_QUEUE: 1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1551 break; 1552 1553 case SO_PASSCRED: 1554 if (sk_may_scm_recv(sk)) 1555 sk->sk_scm_credentials = valbool; 1556 else 1557 ret = -EOPNOTSUPP; 1558 break; 1559 1560 case SO_PASSSEC: 1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1562 sk->sk_scm_security = valbool; 1563 else 1564 ret = -EOPNOTSUPP; 1565 break; 1566 1567 case SO_PASSPIDFD: 1568 if (sk_is_unix(sk)) 1569 sk->sk_scm_pidfd = valbool; 1570 else 1571 ret = -EOPNOTSUPP; 1572 break; 1573 1574 case SO_PASSRIGHTS: 1575 if (sk_is_unix(sk)) 1576 sk->sk_scm_rights = valbool; 1577 else 1578 ret = -EOPNOTSUPP; 1579 break; 1580 1581 case SO_INCOMING_CPU: 1582 reuseport_update_incoming_cpu(sk, val); 1583 break; 1584 1585 case SO_CNX_ADVICE: 1586 if (val == 1) 1587 dst_negative_advice(sk); 1588 break; 1589 1590 case SO_ZEROCOPY: 1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1592 if (!(sk_is_tcp(sk) || 1593 (sk->sk_type == SOCK_DGRAM && 1594 sk->sk_protocol == IPPROTO_UDP))) 1595 ret = -EOPNOTSUPP; 1596 } else if (sk->sk_family != PF_RDS) { 1597 ret = -EOPNOTSUPP; 1598 } 1599 if (!ret) { 1600 if (val < 0 || val > 1) 1601 ret = -EINVAL; 1602 else 1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1604 } 1605 break; 1606 1607 case SO_TXTIME: 1608 if (optlen != sizeof(struct sock_txtime)) { 1609 ret = -EINVAL; 1610 break; 1611 } else if (copy_from_sockptr(&sk_txtime, optval, 1612 sizeof(struct sock_txtime))) { 1613 ret = -EFAULT; 1614 
break; 1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1616 ret = -EINVAL; 1617 break; 1618 } 1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1620 * scheduler has enough safe guards. 1621 */ 1622 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1624 ret = -EPERM; 1625 break; 1626 } 1627 1628 ret = sockopt_validate_clockid(sk_txtime.clockid); 1629 if (ret) 1630 break; 1631 1632 sock_valbool_flag(sk, SOCK_TXTIME, true); 1633 sk->sk_clockid = sk_txtime.clockid; 1634 sk->sk_txtime_deadline_mode = 1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1636 sk->sk_txtime_report_errors = 1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1638 break; 1639 1640 case SO_BINDTOIFINDEX: 1641 ret = sock_bindtoindex_locked(sk, val); 1642 break; 1643 1644 case SO_BUF_LOCK: 1645 if (val & ~SOCK_BUF_LOCK_MASK) { 1646 ret = -EINVAL; 1647 break; 1648 } 1649 sk->sk_userlocks = val | (sk->sk_userlocks & 1650 ~SOCK_BUF_LOCK_MASK); 1651 break; 1652 1653 case SO_RESERVE_MEM: 1654 { 1655 int delta; 1656 1657 if (val < 0) { 1658 ret = -EINVAL; 1659 break; 1660 } 1661 1662 delta = val - sk->sk_reserved_mem; 1663 if (delta < 0) 1664 sock_release_reserved_memory(sk, -delta); 1665 else 1666 ret = sock_reserve_memory(sk, delta); 1667 break; 1668 } 1669 1670 default: 1671 ret = -ENOPROTOOPT; 1672 break; 1673 } 1674 sockopt_release_sock(sk); 1675 return ret; 1676 } 1677 1678 int sock_setsockopt(struct socket *sock, int level, int optname, 1679 sockptr_t optval, unsigned int optlen) 1680 { 1681 return sk_setsockopt(sock->sk, level, optname, 1682 optval, optlen); 1683 } 1684 EXPORT_SYMBOL(sock_setsockopt); 1685 1686 static const struct cred *sk_get_peer_cred(struct sock *sk) 1687 { 1688 const struct cred *cred; 1689 1690 spin_lock(&sk->sk_peer_lock); 1691 cred = get_cred(sk->sk_peer_cred); 1692 spin_unlock(&sk->sk_peer_lock); 1693 1694 return cred; 1695 } 1696 1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1698 struct ucred *ucred) 1699 { 1700 ucred->pid = pid_vnr(pid); 1701 ucred->uid = ucred->gid = -1; 1702 if (cred) { 1703 struct user_namespace *current_ns = current_user_ns(); 1704 1705 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1706 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1707 } 1708 } 1709 1710 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1711 { 1712 struct user_namespace *user_ns = current_user_ns(); 1713 int i; 1714 1715 for (i = 0; i < src->ngroups; i++) { 1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1717 1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1719 return -EFAULT; 1720 } 1721 1722 return 0; 1723 } 1724 1725 int sk_getsockopt(struct sock *sk, int level, int optname, 1726 sockptr_t optval, sockptr_t optlen) 1727 { 1728 struct socket *sock = sk->sk_socket; 1729 1730 union { 1731 int val; 1732 u64 val64; 1733 unsigned long ulval; 1734 struct linger ling; 1735 struct old_timeval32 tm32; 1736 struct __kernel_old_timeval tm; 1737 struct __kernel_sock_timeval stm; 1738 struct sock_txtime txtime; 1739 struct so_timestamping timestamping; 1740 } v; 1741 1742 int lv = sizeof(int); 1743 int len; 1744 1745 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1746 return -EFAULT; 1747 if (len < 0) 1748 return -EINVAL; 1749 1750 memset(&v, 0, sizeof(v)); 1751 1752 switch (optname) { 1753 case SO_DEBUG: 1754 v.val = sock_flag(sk, SOCK_DBG); 1755 break; 1756 1757 case SO_DONTROUTE: 1758 v.val = sock_flag(sk, 
SOCK_LOCALROUTE); 1759 break; 1760 1761 case SO_BROADCAST: 1762 v.val = sock_flag(sk, SOCK_BROADCAST); 1763 break; 1764 1765 case SO_SNDBUF: 1766 v.val = READ_ONCE(sk->sk_sndbuf); 1767 break; 1768 1769 case SO_RCVBUF: 1770 v.val = READ_ONCE(sk->sk_rcvbuf); 1771 break; 1772 1773 case SO_REUSEADDR: 1774 v.val = sk->sk_reuse; 1775 break; 1776 1777 case SO_REUSEPORT: 1778 v.val = sk->sk_reuseport; 1779 break; 1780 1781 case SO_KEEPALIVE: 1782 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1783 break; 1784 1785 case SO_TYPE: 1786 v.val = sk->sk_type; 1787 break; 1788 1789 case SO_PROTOCOL: 1790 v.val = sk->sk_protocol; 1791 break; 1792 1793 case SO_DOMAIN: 1794 v.val = sk->sk_family; 1795 break; 1796 1797 case SO_ERROR: 1798 v.val = -sock_error(sk); 1799 if (v.val == 0) 1800 v.val = xchg(&sk->sk_err_soft, 0); 1801 break; 1802 1803 case SO_OOBINLINE: 1804 v.val = sock_flag(sk, SOCK_URGINLINE); 1805 break; 1806 1807 case SO_NO_CHECK: 1808 v.val = sk->sk_no_check_tx; 1809 break; 1810 1811 case SO_PRIORITY: 1812 v.val = READ_ONCE(sk->sk_priority); 1813 break; 1814 1815 case SO_LINGER: 1816 lv = sizeof(v.ling); 1817 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1818 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1819 break; 1820 1821 case SO_BSDCOMPAT: 1822 break; 1823 1824 case SO_TIMESTAMP_OLD: 1825 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1826 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1827 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1828 break; 1829 1830 case SO_TIMESTAMPNS_OLD: 1831 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1832 break; 1833 1834 case SO_TIMESTAMP_NEW: 1835 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1836 break; 1837 1838 case SO_TIMESTAMPNS_NEW: 1839 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1840 break; 1841 1842 case SO_TIMESTAMPING_OLD: 1843 case SO_TIMESTAMPING_NEW: 1844 lv = sizeof(v.timestamping); 1845 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only 1846 * returning the flags when they were set through the same option. 1847 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. 
1848 */ 1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1852 } 1853 break; 1854 1855 case SO_RCVTIMEO_OLD: 1856 case SO_RCVTIMEO_NEW: 1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1858 SO_RCVTIMEO_OLD == optname); 1859 break; 1860 1861 case SO_SNDTIMEO_OLD: 1862 case SO_SNDTIMEO_NEW: 1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1864 SO_SNDTIMEO_OLD == optname); 1865 break; 1866 1867 case SO_RCVLOWAT: 1868 v.val = READ_ONCE(sk->sk_rcvlowat); 1869 break; 1870 1871 case SO_SNDLOWAT: 1872 v.val = 1; 1873 break; 1874 1875 case SO_PASSCRED: 1876 if (!sk_may_scm_recv(sk)) 1877 return -EOPNOTSUPP; 1878 1879 v.val = sk->sk_scm_credentials; 1880 break; 1881 1882 case SO_PASSPIDFD: 1883 if (!sk_is_unix(sk)) 1884 return -EOPNOTSUPP; 1885 1886 v.val = sk->sk_scm_pidfd; 1887 break; 1888 1889 case SO_PASSRIGHTS: 1890 if (!sk_is_unix(sk)) 1891 return -EOPNOTSUPP; 1892 1893 v.val = sk->sk_scm_rights; 1894 break; 1895 1896 case SO_PEERCRED: 1897 { 1898 struct ucred peercred; 1899 if (len > sizeof(peercred)) 1900 len = sizeof(peercred); 1901 1902 spin_lock(&sk->sk_peer_lock); 1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1904 spin_unlock(&sk->sk_peer_lock); 1905 1906 if (copy_to_sockptr(optval, &peercred, len)) 1907 return -EFAULT; 1908 goto lenout; 1909 } 1910 1911 case SO_PEERPIDFD: 1912 { 1913 struct pid *peer_pid; 1914 struct file *pidfd_file = NULL; 1915 unsigned int flags = 0; 1916 int pidfd; 1917 1918 if (len > sizeof(pidfd)) 1919 len = sizeof(pidfd); 1920 1921 spin_lock(&sk->sk_peer_lock); 1922 peer_pid = get_pid(sk->sk_peer_pid); 1923 spin_unlock(&sk->sk_peer_lock); 1924 1925 if (!peer_pid) 1926 return -ENODATA; 1927 1928 /* The use of PIDFD_STALE requires stashing of struct pid 1929 * on pidfs with pidfs_register_pid() and only AF_UNIX 1930 * were prepared for this. 1931 */ 1932 if (sk->sk_family == AF_UNIX) 1933 flags = PIDFD_STALE; 1934 1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1936 put_pid(peer_pid); 1937 if (pidfd < 0) 1938 return pidfd; 1939 1940 if (copy_to_sockptr(optval, &pidfd, len) || 1941 copy_to_sockptr(optlen, &len, sizeof(int))) { 1942 put_unused_fd(pidfd); 1943 fput(pidfd_file); 1944 1945 return -EFAULT; 1946 } 1947 1948 fd_install(pidfd, pidfd_file); 1949 return 0; 1950 } 1951 1952 case SO_PEERGROUPS: 1953 { 1954 const struct cred *cred; 1955 int ret, n; 1956 1957 cred = sk_get_peer_cred(sk); 1958 if (!cred) 1959 return -ENODATA; 1960 1961 n = cred->group_info->ngroups; 1962 if (len < n * sizeof(gid_t)) { 1963 len = n * sizeof(gid_t); 1964 put_cred(cred); 1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1966 } 1967 len = n * sizeof(gid_t); 1968 1969 ret = groups_to_user(optval, cred->group_info); 1970 put_cred(cred); 1971 if (ret) 1972 return ret; 1973 goto lenout; 1974 } 1975 1976 case SO_PEERNAME: 1977 { 1978 struct sockaddr_storage address; 1979 1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1981 if (lv < 0) 1982 return -ENOTCONN; 1983 if (lv < len) 1984 return -EINVAL; 1985 if (copy_to_sockptr(optval, &address, len)) 1986 return -EFAULT; 1987 goto lenout; 1988 } 1989 1990 /* Dubious BSD thing... Probably nobody even uses it, but 1991 * the UNIX standard wants it for whatever reason... 
-DaveM 1992 */ 1993 case SO_ACCEPTCONN: 1994 v.val = sk->sk_state == TCP_LISTEN; 1995 break; 1996 1997 case SO_PASSSEC: 1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 1999 return -EOPNOTSUPP; 2000 2001 v.val = sk->sk_scm_security; 2002 break; 2003 2004 case SO_PEERSEC: 2005 return security_socket_getpeersec_stream(sock, 2006 optval, optlen, len); 2007 2008 case SO_MARK: 2009 v.val = READ_ONCE(sk->sk_mark); 2010 break; 2011 2012 case SO_RCVMARK: 2013 v.val = sock_flag(sk, SOCK_RCVMARK); 2014 break; 2015 2016 case SO_RCVPRIORITY: 2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2018 break; 2019 2020 case SO_RXQ_OVFL: 2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2022 break; 2023 2024 case SO_WIFI_STATUS: 2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2026 break; 2027 2028 case SO_PEEK_OFF: 2029 if (!READ_ONCE(sock->ops)->set_peek_off) 2030 return -EOPNOTSUPP; 2031 2032 v.val = READ_ONCE(sk->sk_peek_off); 2033 break; 2034 case SO_NOFCS: 2035 v.val = sock_flag(sk, SOCK_NOFCS); 2036 break; 2037 2038 case SO_BINDTODEVICE: 2039 return sock_getbindtodevice(sk, optval, optlen, len); 2040 2041 case SO_GET_FILTER: 2042 len = sk_get_filter(sk, optval, len); 2043 if (len < 0) 2044 return len; 2045 2046 goto lenout; 2047 2048 case SO_LOCK_FILTER: 2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2050 break; 2051 2052 case SO_BPF_EXTENSIONS: 2053 v.val = bpf_tell_extensions(); 2054 break; 2055 2056 case SO_SELECT_ERR_QUEUE: 2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2058 break; 2059 2060 #ifdef CONFIG_NET_RX_BUSY_POLL 2061 case SO_BUSY_POLL: 2062 v.val = READ_ONCE(sk->sk_ll_usec); 2063 break; 2064 case SO_PREFER_BUSY_POLL: 2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2066 break; 2067 #endif 2068 2069 case SO_MAX_PACING_RATE: 2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2072 lv = sizeof(v.ulval); 2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2074 } else { 2075 /* 32bit version */ 2076 v.val = min_t(unsigned long, ~0U, 2077 READ_ONCE(sk->sk_max_pacing_rate)); 2078 } 2079 break; 2080 2081 case SO_INCOMING_CPU: 2082 v.val = READ_ONCE(sk->sk_incoming_cpu); 2083 break; 2084 2085 case SO_MEMINFO: 2086 { 2087 u32 meminfo[SK_MEMINFO_VARS]; 2088 2089 sk_get_meminfo(sk, meminfo); 2090 2091 len = min_t(unsigned int, len, sizeof(meminfo)); 2092 if (copy_to_sockptr(optval, &meminfo, len)) 2093 return -EFAULT; 2094 2095 goto lenout; 2096 } 2097 2098 #ifdef CONFIG_NET_RX_BUSY_POLL 2099 case SO_INCOMING_NAPI_ID: 2100 v.val = READ_ONCE(sk->sk_napi_id); 2101 2102 /* aggregate non-NAPI IDs down to 0 */ 2103 if (!napi_id_valid(v.val)) 2104 v.val = 0; 2105 2106 break; 2107 #endif 2108 2109 case SO_COOKIE: 2110 lv = sizeof(u64); 2111 if (len < lv) 2112 return -EINVAL; 2113 v.val64 = sock_gen_cookie(sk); 2114 break; 2115 2116 case SO_ZEROCOPY: 2117 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2118 break; 2119 2120 case SO_TXTIME: 2121 lv = sizeof(v.txtime); 2122 v.txtime.clockid = sk->sk_clockid; 2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2124 SOF_TXTIME_DEADLINE_MODE : 0; 2125 v.txtime.flags |= sk->sk_txtime_report_errors ? 
2126 SOF_TXTIME_REPORT_ERRORS : 0; 2127 break; 2128 2129 case SO_BINDTOIFINDEX: 2130 v.val = READ_ONCE(sk->sk_bound_dev_if); 2131 break; 2132 2133 case SO_NETNS_COOKIE: 2134 lv = sizeof(u64); 2135 if (len != lv) 2136 return -EINVAL; 2137 v.val64 = sock_net(sk)->net_cookie; 2138 break; 2139 2140 case SO_BUF_LOCK: 2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2142 break; 2143 2144 case SO_RESERVE_MEM: 2145 v.val = READ_ONCE(sk->sk_reserved_mem); 2146 break; 2147 2148 case SO_TXREHASH: 2149 if (!sk_is_tcp(sk)) 2150 return -EOPNOTSUPP; 2151 2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2153 v.val = READ_ONCE(sk->sk_txrehash); 2154 break; 2155 2156 default: 2157 /* We implement the SO_SNDLOWAT etc to not be settable 2158 * (1003.1g 7). 2159 */ 2160 return -ENOPROTOOPT; 2161 } 2162 2163 if (len > lv) 2164 len = lv; 2165 if (copy_to_sockptr(optval, &v, len)) 2166 return -EFAULT; 2167 lenout: 2168 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2169 return -EFAULT; 2170 return 0; 2171 } 2172 2173 /* 2174 * Initialize an sk_lock. 2175 * 2176 * (We also register the sk_lock with the lock validator.) 2177 */ 2178 static inline void sock_lock_init(struct sock *sk) 2179 { 2180 sk_owner_clear(sk); 2181 2182 if (sk->sk_kern_sock) 2183 sock_lock_init_class_and_name( 2184 sk, 2185 af_family_kern_slock_key_strings[sk->sk_family], 2186 af_family_kern_slock_keys + sk->sk_family, 2187 af_family_kern_key_strings[sk->sk_family], 2188 af_family_kern_keys + sk->sk_family); 2189 else 2190 sock_lock_init_class_and_name( 2191 sk, 2192 af_family_slock_key_strings[sk->sk_family], 2193 af_family_slock_keys + sk->sk_family, 2194 af_family_key_strings[sk->sk_family], 2195 af_family_keys + sk->sk_family); 2196 } 2197 2198 /* 2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2200 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2202 */ 2203 static void sock_copy(struct sock *nsk, const struct sock *osk) 2204 { 2205 const struct proto *prot = READ_ONCE(osk->sk_prot); 2206 #ifdef CONFIG_SECURITY_NETWORK 2207 void *sptr = nsk->sk_security; 2208 #endif 2209 2210 /* If we move sk_tx_queue_mapping out of the private section, 2211 * we must check if sk_tx_queue_clear() is called after 2212 * sock_copy() in sk_clone_lock(). 
2213 */ 2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2215 offsetof(struct sock, sk_dontcopy_begin) || 2216 offsetof(struct sock, sk_tx_queue_mapping) >= 2217 offsetof(struct sock, sk_dontcopy_end)); 2218 2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2220 2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2223 /* alloc is larger than struct, see sk_prot_alloc() */); 2224 2225 #ifdef CONFIG_SECURITY_NETWORK 2226 nsk->sk_security = sptr; 2227 security_sk_clone(osk, nsk); 2228 #endif 2229 } 2230 2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2232 int family) 2233 { 2234 struct sock *sk; 2235 struct kmem_cache *slab; 2236 2237 slab = prot->slab; 2238 if (slab != NULL) { 2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2240 if (!sk) 2241 return sk; 2242 if (want_init_on_alloc(priority)) 2243 sk_prot_clear_nulls(sk, prot->obj_size); 2244 } else 2245 sk = kmalloc(prot->obj_size, priority); 2246 2247 if (sk != NULL) { 2248 if (security_sk_alloc(sk, family, priority)) 2249 goto out_free; 2250 2251 if (!try_module_get(prot->owner)) 2252 goto out_free_sec; 2253 } 2254 2255 return sk; 2256 2257 out_free_sec: 2258 security_sk_free(sk); 2259 out_free: 2260 if (slab != NULL) 2261 kmem_cache_free(slab, sk); 2262 else 2263 kfree(sk); 2264 return NULL; 2265 } 2266 2267 static void sk_prot_free(struct proto *prot, struct sock *sk) 2268 { 2269 struct kmem_cache *slab; 2270 struct module *owner; 2271 2272 owner = prot->owner; 2273 slab = prot->slab; 2274 2275 cgroup_sk_free(&sk->sk_cgrp_data); 2276 mem_cgroup_sk_free(sk); 2277 security_sk_free(sk); 2278 2279 sk_owner_put(sk); 2280 2281 if (slab != NULL) 2282 kmem_cache_free(slab, sk); 2283 else 2284 kfree(sk); 2285 module_put(owner); 2286 } 2287 2288 /** 2289 * sk_alloc - All socket objects are allocated here 2290 * @net: the applicable net namespace 2291 * @family: protocol family 2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2293 * @prot: struct proto associated with this new sock instance 2294 * @kern: is this to be a kernel socket? 2295 */ 2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2297 struct proto *prot, int kern) 2298 { 2299 struct sock *sk; 2300 2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2302 if (sk) { 2303 sk->sk_family = family; 2304 /* 2305 * See comment in struct sock definition to understand 2306 * why we need sk_prot_creator -acme 2307 */ 2308 sk->sk_prot = sk->sk_prot_creator = prot; 2309 2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) 2311 sk->sk_bypass_prot_mem = 1; 2312 2313 sk->sk_kern_sock = kern; 2314 sock_lock_init(sk); 2315 2316 sk->sk_net_refcnt = kern ? 0 : 1; 2317 if (likely(sk->sk_net_refcnt)) { 2318 get_net_track(net, &sk->ns_tracker, priority); 2319 sock_inuse_add(net, 1); 2320 } else { 2321 net_passive_inc(net); 2322 __netns_tracker_alloc(net, &sk->ns_tracker, 2323 false, priority); 2324 } 2325 2326 sock_net_set(sk, net); 2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2328 2329 mem_cgroup_sk_alloc(sk); 2330 cgroup_sk_alloc(&sk->sk_cgrp_data); 2331 sock_update_classid(&sk->sk_cgrp_data); 2332 sock_update_netprioidx(&sk->sk_cgrp_data); 2333 sk_tx_queue_clear(sk); 2334 } 2335 2336 return sk; 2337 } 2338 EXPORT_SYMBOL(sk_alloc); 2339 2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2341 * grace period. This is the case for UDP sockets and TCP listeners. 
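 *
 * A protocol opts in by setting the flag on the socket it publishes for
 * RCU lookups; a minimal sketch (hypothetical caller, not taken from
 * this file):
 *
 *	sock_set_flag(sk, SOCK_RCU_FREE);
 *	...
 *	sk_free(sk);	(the actual free is deferred via call_rcu())
 *
 * so a lookup running under rcu_read_lock() never touches freed memory
 * even if it races with the release.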
2342 */ 2343 static void __sk_destruct(struct rcu_head *head) 2344 { 2345 struct sock *sk = container_of(head, struct sock, sk_rcu); 2346 struct net *net = sock_net(sk); 2347 struct sk_filter *filter; 2348 2349 if (sk->sk_destruct) 2350 sk->sk_destruct(sk); 2351 2352 filter = rcu_dereference_check(sk->sk_filter, 2353 refcount_read(&sk->sk_wmem_alloc) == 0); 2354 if (filter) { 2355 sk_filter_uncharge(sk, filter); 2356 RCU_INIT_POINTER(sk->sk_filter, NULL); 2357 } 2358 2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2360 2361 #ifdef CONFIG_BPF_SYSCALL 2362 bpf_sk_storage_free(sk); 2363 #endif 2364 2365 if (atomic_read(&sk->sk_omem_alloc)) 2366 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2367 __func__, atomic_read(&sk->sk_omem_alloc)); 2368 2369 if (sk->sk_frag.page) { 2370 put_page(sk->sk_frag.page); 2371 sk->sk_frag.page = NULL; 2372 } 2373 2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2375 put_cred(sk->sk_peer_cred); 2376 put_pid(sk->sk_peer_pid); 2377 2378 if (likely(sk->sk_net_refcnt)) { 2379 put_net_track(net, &sk->ns_tracker); 2380 } else { 2381 __netns_tracker_free(net, &sk->ns_tracker, false); 2382 net_passive_dec(net); 2383 } 2384 sk_prot_free(sk->sk_prot_creator, sk); 2385 } 2386 2387 void sk_net_refcnt_upgrade(struct sock *sk) 2388 { 2389 struct net *net = sock_net(sk); 2390 2391 WARN_ON_ONCE(sk->sk_net_refcnt); 2392 __netns_tracker_free(net, &sk->ns_tracker, false); 2393 net_passive_dec(net); 2394 sk->sk_net_refcnt = 1; 2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2396 sock_inuse_add(net, 1); 2397 } 2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2399 2400 void sk_destruct(struct sock *sk) 2401 { 2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2403 2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2405 reuseport_detach_sock(sk); 2406 use_call_rcu = true; 2407 } 2408 2409 if (use_call_rcu) 2410 call_rcu(&sk->sk_rcu, __sk_destruct); 2411 else 2412 __sk_destruct(&sk->sk_rcu); 2413 } 2414 2415 static void __sk_free(struct sock *sk) 2416 { 2417 if (likely(sk->sk_net_refcnt)) 2418 sock_inuse_add(sock_net(sk), -1); 2419 2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2421 sock_diag_broadcast_destroy(sk); 2422 else 2423 sk_destruct(sk); 2424 } 2425 2426 void sk_free(struct sock *sk) 2427 { 2428 /* 2429 * We subtract one from sk_wmem_alloc and can know if 2430 * some packets are still in some tx queue. 
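	 * (sk_wmem_alloc was seeded with SK_WMEM_ALLOC_BIAS in sk_alloc(),
	 * and every transmitted skb adds its truesize via skb_set_owner_w(),
	 * so a non-zero remainder means such packets are still outstanding.)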
2431 * If not null, sock_wfree() will call __sk_free(sk) later 2432 */ 2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2434 __sk_free(sk); 2435 } 2436 EXPORT_SYMBOL(sk_free); 2437 2438 static void sk_init_common(struct sock *sk) 2439 { 2440 skb_queue_head_init(&sk->sk_receive_queue); 2441 skb_queue_head_init(&sk->sk_write_queue); 2442 skb_queue_head_init(&sk->sk_error_queue); 2443 2444 rwlock_init(&sk->sk_callback_lock); 2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2446 af_rlock_keys + sk->sk_family, 2447 af_family_rlock_key_strings[sk->sk_family]); 2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2449 af_wlock_keys + sk->sk_family, 2450 af_family_wlock_key_strings[sk->sk_family]); 2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2452 af_elock_keys + sk->sk_family, 2453 af_family_elock_key_strings[sk->sk_family]); 2454 if (sk->sk_kern_sock) 2455 lockdep_set_class_and_name(&sk->sk_callback_lock, 2456 af_kern_callback_keys + sk->sk_family, 2457 af_family_kern_clock_key_strings[sk->sk_family]); 2458 else 2459 lockdep_set_class_and_name(&sk->sk_callback_lock, 2460 af_callback_keys + sk->sk_family, 2461 af_family_clock_key_strings[sk->sk_family]); 2462 } 2463 2464 /** 2465 * sk_clone - clone a socket 2466 * @sk: the socket to clone 2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2468 * @lock: if true, lock the cloned sk 2469 * 2470 * If @lock is true, the clone is locked by bh_lock_sock(), and 2471 * caller must unlock socket even in error path by bh_unlock_sock(). 2472 */ 2473 struct sock *sk_clone(const struct sock *sk, const gfp_t priority, 2474 bool lock) 2475 { 2476 struct proto *prot = READ_ONCE(sk->sk_prot); 2477 struct sk_filter *filter; 2478 bool is_charged = true; 2479 struct sock *newsk; 2480 2481 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2482 if (!newsk) 2483 goto out; 2484 2485 sock_copy(newsk, sk); 2486 2487 newsk->sk_prot_creator = prot; 2488 2489 /* SANITY */ 2490 if (likely(newsk->sk_net_refcnt)) { 2491 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2492 sock_inuse_add(sock_net(newsk), 1); 2493 } else { 2494 /* Kernel sockets are not elevating the struct net refcount. 2495 * Instead, use a tracker to more easily detect if a layer 2496 * is not properly dismantling its kernel sockets at netns 2497 * destroy time. 
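		 *
		 * A kernel socket that is later handed to user space can
		 * take a real reference with sk_net_refcnt_upgrade() above;
		 * a rough sketch (hypothetical caller, not from this file):
		 *
		 *	sk = sk_alloc(net, family, GFP_KERNEL, prot, 1);
		 *	...
		 *	sk_net_refcnt_upgrade(sk);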
2498 */ 2499 net_passive_inc(sock_net(newsk)); 2500 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2501 false, priority); 2502 } 2503 2504 sk_node_init(&newsk->sk_node); 2505 sock_lock_init(newsk); 2506 2507 if (lock) 2508 bh_lock_sock(newsk); 2509 2510 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2511 newsk->sk_backlog.len = 0; 2512 2513 atomic_set(&newsk->sk_rmem_alloc, 0); 2514 2515 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2516 2517 atomic_set(&newsk->sk_omem_alloc, 0); 2518 sk_init_common(newsk); 2519 2520 newsk->sk_dst_cache = NULL; 2521 newsk->sk_dst_pending_confirm = 0; 2522 newsk->sk_wmem_queued = 0; 2523 newsk->sk_forward_alloc = 0; 2524 newsk->sk_reserved_mem = 0; 2525 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); 2526 sk_drops_reset(newsk); 2527 newsk->sk_send_head = NULL; 2528 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2529 atomic_set(&newsk->sk_zckey, 0); 2530 2531 sock_reset_flag(newsk, SOCK_DONE); 2532 2533 #ifdef CONFIG_MEMCG 2534 /* sk->sk_memcg will be populated at accept() time */ 2535 newsk->sk_memcg = NULL; 2536 #endif 2537 2538 cgroup_sk_clone(&newsk->sk_cgrp_data); 2539 2540 rcu_read_lock(); 2541 filter = rcu_dereference(sk->sk_filter); 2542 if (filter != NULL) 2543 /* though it's an empty new sock, the charging may fail 2544 * if sysctl_optmem_max was changed between creation of 2545 * original socket and cloning 2546 */ 2547 is_charged = sk_filter_charge(newsk, filter); 2548 RCU_INIT_POINTER(newsk->sk_filter, filter); 2549 rcu_read_unlock(); 2550 2551 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2552 /* We need to make sure that we don't uncharge the new 2553 * socket if we couldn't charge it in the first place 2554 * as otherwise we uncharge the parent's filter. 2555 */ 2556 if (!is_charged) 2557 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2558 2559 goto free; 2560 } 2561 2562 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2563 2564 if (bpf_sk_storage_clone(sk, newsk)) 2565 goto free; 2566 2567 /* Clear sk_user_data if parent had the pointer tagged 2568 * as not suitable for copying when cloning. 
2569 */ 2570 if (sk_user_data_is_nocopy(newsk)) 2571 newsk->sk_user_data = NULL; 2572 2573 newsk->sk_err = 0; 2574 newsk->sk_err_soft = 0; 2575 newsk->sk_priority = 0; 2576 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2577 2578 /* Before updating sk_refcnt, we must commit prior changes to memory 2579 * (Documentation/RCU/rculist_nulls.rst for details) 2580 */ 2581 smp_wmb(); 2582 refcount_set(&newsk->sk_refcnt, 2); 2583 2584 sk_set_socket(newsk, NULL); 2585 sk_tx_queue_clear(newsk); 2586 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2587 2588 if (newsk->sk_prot->sockets_allocated) 2589 sk_sockets_allocated_inc(newsk); 2590 2591 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2592 net_enable_timestamp(); 2593 out: 2594 return newsk; 2595 free: 2596 /* It is still raw copy of parent, so invalidate 2597 * destructor and make plain sk_free() 2598 */ 2599 newsk->sk_destruct = NULL; 2600 if (lock) 2601 bh_unlock_sock(newsk); 2602 sk_free(newsk); 2603 newsk = NULL; 2604 goto out; 2605 } 2606 EXPORT_SYMBOL_GPL(sk_clone); 2607 2608 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) 2609 { 2610 bool is_ipv6 = false; 2611 u32 max_size; 2612 2613 #if IS_ENABLED(CONFIG_IPV6) 2614 is_ipv6 = (sk->sk_family == AF_INET6 && 2615 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2616 #endif 2617 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2618 max_size = is_ipv6 ? READ_ONCE(dev->gso_max_size) : 2619 READ_ONCE(dev->gso_ipv4_max_size); 2620 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2621 max_size = GSO_LEGACY_MAX_SIZE; 2622 2623 return max_size - (MAX_TCP_HEADER + 1); 2624 } 2625 2626 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2627 { 2628 const struct net_device *dev; 2629 u32 max_segs = 1; 2630 2631 rcu_read_lock(); 2632 dev = dst_dev_rcu(dst); 2633 sk->sk_route_caps = dev->features; 2634 if (sk_is_tcp(sk)) { 2635 struct inet_connection_sock *icsk = inet_csk(sk); 2636 2637 sk->sk_route_caps |= NETIF_F_GSO; 2638 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); 2639 } 2640 if (sk->sk_route_caps & NETIF_F_GSO) 2641 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2642 if (unlikely(sk->sk_gso_disabled)) 2643 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2644 if (sk_can_gso(sk)) { 2645 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2646 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2647 } else { 2648 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2649 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); 2650 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2651 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); 2652 } 2653 } 2654 sk->sk_gso_max_segs = max_segs; 2655 sk_dst_set(sk, dst); 2656 rcu_read_unlock(); 2657 } 2658 EXPORT_SYMBOL_GPL(sk_setup_caps); 2659 2660 /* 2661 * Simple resource managers for sockets. 2662 */ 2663 2664 2665 /* 2666 * Write buffer destructor automatically called from kfree_skb. 
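 *
 * The destructor is installed by skb_set_owner_w() below, which also
 * charges skb->truesize to sk->sk_wmem_alloc; sock_wfree() reverses the
 * charge and wakes up writers. A minimal sketch of the pairing
 * (hypothetical sender path, not from this file):
 *
 *	skb = alloc_skb(len, GFP_KERNEL);
 *	if (skb)
 *		skb_set_owner_w(skb, sk);
 *	...
 *	kfree_skb(skb);		(invokes sock_wfree() via skb->destructor)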
2667 */ 2668 void sock_wfree(struct sk_buff *skb) 2669 { 2670 unsigned int len = skb->truesize; 2671 struct sock *sk = skb->sk; 2672 bool free; 2673 int old; 2674 2675 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2676 if (sock_flag(sk, SOCK_RCU_FREE) && 2677 sk->sk_write_space == sock_def_write_space) { 2678 rcu_read_lock(); 2679 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, 2680 &old); 2681 sock_def_write_space_wfree(sk, old - len); 2682 rcu_read_unlock(); 2683 if (unlikely(free)) 2684 __sk_free(sk); 2685 return; 2686 } 2687 2688 /* 2689 * Keep a reference on sk_wmem_alloc, this will be released 2690 * after sk_write_space() call 2691 */ 2692 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2693 sk->sk_write_space(sk); 2694 len = 1; 2695 } 2696 /* 2697 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2698 * could not do because of in-flight packets 2699 */ 2700 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2701 __sk_free(sk); 2702 } 2703 EXPORT_SYMBOL(sock_wfree); 2704 2705 /* This variant of sock_wfree() is used by TCP, 2706 * since it sets SOCK_USE_WRITE_QUEUE. 2707 */ 2708 void __sock_wfree(struct sk_buff *skb) 2709 { 2710 struct sock *sk = skb->sk; 2711 2712 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2713 __sk_free(sk); 2714 } 2715 2716 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2717 { 2718 int old_wmem; 2719 2720 skb_orphan(skb); 2721 #ifdef CONFIG_INET 2722 if (unlikely(!sk_fullsock(sk))) 2723 return skb_set_owner_edemux(skb, sk); 2724 #endif 2725 skb->sk = sk; 2726 skb->destructor = sock_wfree; 2727 skb_set_hash_from_sk(skb, sk); 2728 /* 2729 * We used to take a refcount on sk, but following operation 2730 * is enough to guarantee sk_free() won't free this sock until 2731 * all in-flight packets are completed 2732 */ 2733 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); 2734 2735 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket 2736 * is in a host queue (qdisc, NIC queue). 2737 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue 2738 * based on XPS for better performance. 2739 * Otherwise clear ooo_okay to not risk Out Of Order delivery. 2740 */ 2741 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); 2742 } 2743 EXPORT_SYMBOL(skb_set_owner_w); 2744 2745 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2746 { 2747 /* Drivers depend on in-order delivery for crypto offload, 2748 * partial orphan breaks out-of-order-OK logic. 2749 */ 2750 if (skb_is_decrypted(skb)) 2751 return false; 2752 2753 return (skb->destructor == sock_wfree || 2754 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2755 } 2756 2757 /* This helper is used by netem, as it can hold packets in its 2758 * delay queue. We want to allow the owner socket to send more 2759 * packets, as if they were already TX completed by a typical driver. 2760 * But we also want to keep skb->sk set because some packet schedulers 2761 * rely on it (sch_fq for example). 2762 */ 2763 void skb_orphan_partial(struct sk_buff *skb) 2764 { 2765 if (skb_is_tcp_pure_ack(skb)) 2766 return; 2767 2768 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2769 return; 2770 2771 skb_orphan(skb); 2772 } 2773 EXPORT_SYMBOL(skb_orphan_partial); 2774 2775 /* 2776 * Read buffer destructor automatically called from kfree_skb. 
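 *
 * It pairs with skb_set_owner_r(), which charges skb->truesize to
 * sk->sk_rmem_alloc and to the socket's memory accounting when an skb
 * is queued for reading. A minimal sketch (hypothetical receive path,
 * not from this file):
 *
 *	skb_set_owner_r(skb, sk);
 *	skb_queue_tail(&sk->sk_receive_queue, skb);
 *	...
 *	kfree_skb(skb);		(invokes sock_rfree() via skb->destructor)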
2777 */ 2778 void sock_rfree(struct sk_buff *skb) 2779 { 2780 struct sock *sk = skb->sk; 2781 unsigned int len = skb->truesize; 2782 2783 atomic_sub(len, &sk->sk_rmem_alloc); 2784 sk_mem_uncharge(sk, len); 2785 } 2786 EXPORT_SYMBOL(sock_rfree); 2787 2788 /* 2789 * Buffer destructor for skbs that are not used directly in read or write 2790 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2791 */ 2792 void sock_efree(struct sk_buff *skb) 2793 { 2794 sock_put(skb->sk); 2795 } 2796 EXPORT_SYMBOL(sock_efree); 2797 2798 /* Buffer destructor for prefetch/receive path where reference count may 2799 * not be held, e.g. for listen sockets. 2800 */ 2801 #ifdef CONFIG_INET 2802 void sock_pfree(struct sk_buff *skb) 2803 { 2804 struct sock *sk = skb->sk; 2805 2806 if (!sk_is_refcounted(sk)) 2807 return; 2808 2809 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2810 inet_reqsk(sk)->rsk_listener = NULL; 2811 reqsk_free(inet_reqsk(sk)); 2812 return; 2813 } 2814 2815 sock_gen_put(sk); 2816 } 2817 EXPORT_SYMBOL(sock_pfree); 2818 #endif /* CONFIG_INET */ 2819 2820 /* 2821 * Allocate a skb from the socket's send buffer. 2822 */ 2823 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2824 gfp_t priority) 2825 { 2826 if (force || 2827 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2828 struct sk_buff *skb = alloc_skb(size, priority); 2829 2830 if (skb) { 2831 skb_set_owner_w(skb, sk); 2832 return skb; 2833 } 2834 } 2835 return NULL; 2836 } 2837 EXPORT_SYMBOL(sock_wmalloc); 2838 2839 static void sock_ofree(struct sk_buff *skb) 2840 { 2841 struct sock *sk = skb->sk; 2842 2843 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2844 } 2845 2846 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2847 gfp_t priority) 2848 { 2849 struct sk_buff *skb; 2850 2851 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2852 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2853 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2854 return NULL; 2855 2856 skb = alloc_skb(size, priority); 2857 if (!skb) 2858 return NULL; 2859 2860 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2861 skb->sk = sk; 2862 skb->destructor = sock_ofree; 2863 return skb; 2864 } 2865 2866 /* 2867 * Allocate a memory block from the socket's option memory buffer. 2868 */ 2869 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2870 { 2871 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2872 2873 if ((unsigned int)size <= optmem_max && 2874 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2875 void *mem; 2876 /* First do the add, to avoid the race if kmalloc 2877 * might sleep. 2878 */ 2879 atomic_add(size, &sk->sk_omem_alloc); 2880 mem = kmalloc(size, priority); 2881 if (mem) 2882 return mem; 2883 atomic_sub(size, &sk->sk_omem_alloc); 2884 } 2885 return NULL; 2886 } 2887 EXPORT_SYMBOL(sock_kmalloc); 2888 2889 /* 2890 * Duplicate the input "src" memory block using the socket's 2891 * option memory buffer. 2892 */ 2893 void *sock_kmemdup(struct sock *sk, const void *src, 2894 int size, gfp_t priority) 2895 { 2896 void *mem; 2897 2898 mem = sock_kmalloc(sk, size, priority); 2899 if (mem) 2900 memcpy(mem, src, size); 2901 return mem; 2902 } 2903 EXPORT_SYMBOL(sock_kmemdup); 2904 2905 /* Free an option memory block. Note, we actually want the inline 2906 * here as this allows gcc to detect the nullify and fold away the 2907 * condition entirely. 
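 *
 * Callers are expected to pair sock_kmalloc() with sock_kfree_s() (or
 * sock_kzfree_s() for key material) using the same size so that
 * sk_omem_alloc stays balanced. A minimal sketch (hypothetical caller,
 * not from this file):
 *
 *	buf = sock_kmalloc(sk, len, GFP_KERNEL);
 *	if (!buf)
 *		return -ENOBUFS;
 *	...
 *	sock_kfree_s(sk, buf, len);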
2908 */ 2909 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2910 const bool nullify) 2911 { 2912 if (WARN_ON_ONCE(!mem)) 2913 return; 2914 if (nullify) 2915 kfree_sensitive(mem); 2916 else 2917 kfree(mem); 2918 atomic_sub(size, &sk->sk_omem_alloc); 2919 } 2920 2921 void sock_kfree_s(struct sock *sk, void *mem, int size) 2922 { 2923 __sock_kfree_s(sk, mem, size, false); 2924 } 2925 EXPORT_SYMBOL(sock_kfree_s); 2926 2927 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2928 { 2929 __sock_kfree_s(sk, mem, size, true); 2930 } 2931 EXPORT_SYMBOL(sock_kzfree_s); 2932 2933 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2934 I think, these locks should be removed for datagram sockets. 2935 */ 2936 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2937 { 2938 DEFINE_WAIT(wait); 2939 2940 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2941 for (;;) { 2942 if (!timeo) 2943 break; 2944 if (signal_pending(current)) 2945 break; 2946 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2947 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2948 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2949 break; 2950 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2951 break; 2952 if (READ_ONCE(sk->sk_err)) 2953 break; 2954 timeo = schedule_timeout(timeo); 2955 } 2956 finish_wait(sk_sleep(sk), &wait); 2957 return timeo; 2958 } 2959 2960 2961 /* 2962 * Generic send/receive buffer handlers 2963 */ 2964 2965 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2966 unsigned long data_len, int noblock, 2967 int *errcode, int max_page_order) 2968 { 2969 struct sk_buff *skb; 2970 long timeo; 2971 int err; 2972 2973 timeo = sock_sndtimeo(sk, noblock); 2974 for (;;) { 2975 err = sock_error(sk); 2976 if (err != 0) 2977 goto failure; 2978 2979 err = -EPIPE; 2980 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2981 goto failure; 2982 2983 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2984 break; 2985 2986 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2987 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2988 err = -EAGAIN; 2989 if (!timeo) 2990 goto failure; 2991 if (signal_pending(current)) 2992 goto interrupted; 2993 timeo = sock_wait_for_wmem(sk, timeo); 2994 } 2995 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2996 errcode, sk->sk_allocation); 2997 if (skb) 2998 skb_set_owner_w(skb, sk); 2999 return skb; 3000 3001 interrupted: 3002 err = sock_intr_errno(timeo); 3003 failure: 3004 *errcode = err; 3005 return NULL; 3006 } 3007 EXPORT_SYMBOL(sock_alloc_send_pskb); 3008 3009 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 3010 struct sockcm_cookie *sockc) 3011 { 3012 u32 tsflags; 3013 3014 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 3015 3016 switch (cmsg->cmsg_type) { 3017 case SO_MARK: 3018 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 3019 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3020 return -EPERM; 3021 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3022 return -EINVAL; 3023 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 3024 break; 3025 case SO_TIMESTAMPING_OLD: 3026 case SO_TIMESTAMPING_NEW: 3027 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3028 return -EINVAL; 3029 3030 tsflags = *(u32 *)CMSG_DATA(cmsg); 3031 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 3032 return -EINVAL; 3033 3034 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 3035 sockc->tsflags |= tsflags; 3036 break; 3037 case SCM_TXTIME: 3038 if (!sock_flag(sk, SOCK_TXTIME)) 3039 return -EINVAL; 3040 if 
(cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 3041 return -EINVAL; 3042 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 3043 break; 3044 case SCM_TS_OPT_ID: 3045 if (sk_is_tcp(sk)) 3046 return -EINVAL; 3047 tsflags = READ_ONCE(sk->sk_tsflags); 3048 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 3049 return -EINVAL; 3050 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3051 return -EINVAL; 3052 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 3053 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 3054 break; 3055 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */ 3056 case SCM_RIGHTS: 3057 case SCM_CREDENTIALS: 3058 break; 3059 case SO_PRIORITY: 3060 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3061 return -EINVAL; 3062 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) 3063 return -EPERM; 3064 sockc->priority = *(u32 *)CMSG_DATA(cmsg); 3065 break; 3066 case SCM_DEVMEM_DMABUF: 3067 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3068 return -EINVAL; 3069 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); 3070 break; 3071 default: 3072 return -EINVAL; 3073 } 3074 return 0; 3075 } 3076 EXPORT_SYMBOL(__sock_cmsg_send); 3077 3078 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 3079 struct sockcm_cookie *sockc) 3080 { 3081 struct cmsghdr *cmsg; 3082 int ret; 3083 3084 for_each_cmsghdr(cmsg, msg) { 3085 if (!CMSG_OK(msg, cmsg)) 3086 return -EINVAL; 3087 if (cmsg->cmsg_level != SOL_SOCKET) 3088 continue; 3089 ret = __sock_cmsg_send(sk, cmsg, sockc); 3090 if (ret) 3091 return ret; 3092 } 3093 return 0; 3094 } 3095 EXPORT_SYMBOL(sock_cmsg_send); 3096 3097 static void sk_enter_memory_pressure(struct sock *sk) 3098 { 3099 if (!sk->sk_prot->enter_memory_pressure) 3100 return; 3101 3102 sk->sk_prot->enter_memory_pressure(sk); 3103 } 3104 3105 static void sk_leave_memory_pressure(struct sock *sk) 3106 { 3107 if (sk->sk_prot->leave_memory_pressure) { 3108 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 3109 tcp_leave_memory_pressure, sk); 3110 } else { 3111 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 3112 3113 if (memory_pressure && READ_ONCE(*memory_pressure)) 3114 WRITE_ONCE(*memory_pressure, 0); 3115 } 3116 } 3117 3118 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 3119 3120 /** 3121 * skb_page_frag_refill - check that a page_frag contains enough room 3122 * @sz: minimum size of the fragment we want to get 3123 * @pfrag: pointer to page_frag 3124 * @gfp: priority for memory allocation 3125 * 3126 * Note: While this allocator tries to use high order pages, there is 3127 * no guarantee that allocations succeed. Therefore, @sz MUST be 3128 * less or equal than PAGE_SIZE. 
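 *
 * A sketch of the usual pattern through the sk_page_frag_refill()
 * wrapper below (hypothetical sendmsg-style caller, not from this
 * file; 'copy' and 'len' are assumed locals):
 *
 *	struct page_frag *pfrag = sk_page_frag(sk);
 *
 *	if (!sk_page_frag_refill(sk, pfrag))
 *		goto wait_for_memory;
 *	copy = min_t(int, len, pfrag->size - pfrag->offset);
 *	(copy the data into pfrag->page at pfrag->offset)
 *	pfrag->offset += copy;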
3129 */ 3130 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 3131 { 3132 if (pfrag->page) { 3133 if (page_ref_count(pfrag->page) == 1) { 3134 pfrag->offset = 0; 3135 return true; 3136 } 3137 if (pfrag->offset + sz <= pfrag->size) 3138 return true; 3139 put_page(pfrag->page); 3140 } 3141 3142 pfrag->offset = 0; 3143 if (SKB_FRAG_PAGE_ORDER && 3144 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 3145 /* Avoid direct reclaim but allow kswapd to wake */ 3146 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 3147 __GFP_COMP | __GFP_NOWARN | 3148 __GFP_NORETRY, 3149 SKB_FRAG_PAGE_ORDER); 3150 if (likely(pfrag->page)) { 3151 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 3152 return true; 3153 } 3154 } 3155 pfrag->page = alloc_page(gfp); 3156 if (likely(pfrag->page)) { 3157 pfrag->size = PAGE_SIZE; 3158 return true; 3159 } 3160 return false; 3161 } 3162 EXPORT_SYMBOL(skb_page_frag_refill); 3163 3164 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 3165 { 3166 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3167 return true; 3168 3169 if (!sk->sk_bypass_prot_mem) 3170 sk_enter_memory_pressure(sk); 3171 3172 sk_stream_moderate_sndbuf(sk); 3173 3174 return false; 3175 } 3176 EXPORT_SYMBOL(sk_page_frag_refill); 3177 3178 void __lock_sock(struct sock *sk) 3179 __releases(&sk->sk_lock.slock) 3180 __acquires(&sk->sk_lock.slock) 3181 { 3182 DEFINE_WAIT(wait); 3183 3184 for (;;) { 3185 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 3186 TASK_UNINTERRUPTIBLE); 3187 spin_unlock_bh(&sk->sk_lock.slock); 3188 schedule(); 3189 spin_lock_bh(&sk->sk_lock.slock); 3190 if (!sock_owned_by_user(sk)) 3191 break; 3192 } 3193 finish_wait(&sk->sk_lock.wq, &wait); 3194 } 3195 3196 void __release_sock(struct sock *sk) 3197 __releases(&sk->sk_lock.slock) 3198 __acquires(&sk->sk_lock.slock) 3199 { 3200 struct sk_buff *skb, *next; 3201 int nb = 0; 3202 3203 while ((skb = sk->sk_backlog.head) != NULL) { 3204 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 3205 3206 spin_unlock_bh(&sk->sk_lock.slock); 3207 3208 while (1) { 3209 next = skb->next; 3210 prefetch(next); 3211 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3212 skb_mark_not_on_list(skb); 3213 sk_backlog_rcv(sk, skb); 3214 3215 skb = next; 3216 if (!skb) 3217 break; 3218 3219 if (!(++nb & 15)) 3220 cond_resched(); 3221 } 3222 3223 spin_lock_bh(&sk->sk_lock.slock); 3224 } 3225 3226 /* 3227 * Doing the zeroing here guarantee we can not loop forever 3228 * while a wild producer attempts to flood us. 3229 */ 3230 sk->sk_backlog.len = 0; 3231 } 3232 3233 void __sk_flush_backlog(struct sock *sk) 3234 { 3235 spin_lock_bh(&sk->sk_lock.slock); 3236 __release_sock(sk); 3237 3238 if (sk->sk_prot->release_cb) 3239 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3240 tcp_release_cb, sk); 3241 3242 spin_unlock_bh(&sk->sk_lock.slock); 3243 } 3244 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3245 3246 /** 3247 * sk_wait_data - wait for data to arrive at sk_receive_queue 3248 * @sk: sock to wait on 3249 * @timeo: for how long 3250 * @skb: last skb seen on sk_receive_queue 3251 * 3252 * Now socket state including sk->sk_err is changed only under lock, 3253 * hence we may omit checks after joining wait queue. 3254 * We check receive queue before schedule() only as optimization; 3255 * it is very likely that release_sock() added new data. 
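 *
 * Must be called with the socket lock held; the lock is dropped while
 * sleeping and re-acquired before returning. A sketch of a
 * recvmsg()-style caller (hypothetical, not from this file; 'flags'
 * and 'timeo' are assumed locals):
 *
 *	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (!timeo)
 *			return -EAGAIN;
 *		if (signal_pending(current))
 *			return sock_intr_errno(timeo);
 *		sk_wait_data(sk, &timeo, NULL);
 *	}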
3256 */ 3257 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3258 { 3259 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3260 int rc; 3261 3262 add_wait_queue(sk_sleep(sk), &wait); 3263 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3264 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3265 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3266 remove_wait_queue(sk_sleep(sk), &wait); 3267 return rc; 3268 } 3269 EXPORT_SYMBOL(sk_wait_data); 3270 3271 /** 3272 * __sk_mem_raise_allocated - increase memory_allocated 3273 * @sk: socket 3274 * @size: memory size to allocate 3275 * @amt: pages to allocate 3276 * @kind: allocation type 3277 * 3278 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. 3279 * 3280 * Unlike the globally shared limits among the sockets under same protocol, 3281 * consuming the budget of a memcg won't have direct effect on other ones. 3282 * So be optimistic about memcg's tolerance, and leave the callers to decide 3283 * whether or not to raise allocated through sk_under_memory_pressure() or 3284 * its variants. 3285 */ 3286 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3287 { 3288 bool memcg_enabled = false, charged = false; 3289 struct proto *prot = sk->sk_prot; 3290 long allocated = 0; 3291 3292 if (!sk->sk_bypass_prot_mem) { 3293 sk_memory_allocated_add(sk, amt); 3294 allocated = sk_memory_allocated(sk); 3295 } 3296 3297 if (mem_cgroup_sk_enabled(sk)) { 3298 memcg_enabled = true; 3299 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); 3300 if (!charged) 3301 goto suppress_allocation; 3302 } 3303 3304 if (!allocated) 3305 return 1; 3306 3307 /* Under limit. */ 3308 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3309 sk_leave_memory_pressure(sk); 3310 return 1; 3311 } 3312 3313 /* Under pressure. */ 3314 if (allocated > sk_prot_mem_limits(sk, 1)) 3315 sk_enter_memory_pressure(sk); 3316 3317 /* Over hard limit. */ 3318 if (allocated > sk_prot_mem_limits(sk, 2)) 3319 goto suppress_allocation; 3320 3321 /* Guarantee minimum buffer size under pressure (either global 3322 * or memcg) to make sure features described in RFC 7323 (TCP 3323 * Extensions for High Performance) work properly. 3324 * 3325 * This rule does NOT stand when exceeds global or memcg's hard 3326 * limit, or else a DoS attack can be taken place by spawning 3327 * lots of sockets whose usage are under minimum buffer size. 3328 */ 3329 if (kind == SK_MEM_RECV) { 3330 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3331 return 1; 3332 3333 } else { /* SK_MEM_SEND */ 3334 int wmem0 = sk_get_wmem0(sk, prot); 3335 3336 if (sk->sk_type == SOCK_STREAM) { 3337 if (sk->sk_wmem_queued < wmem0) 3338 return 1; 3339 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3340 return 1; 3341 } 3342 } 3343 3344 if (sk_has_memory_pressure(sk)) { 3345 u64 alloc; 3346 3347 /* The following 'average' heuristic is within the 3348 * scope of global accounting, so it only makes 3349 * sense for global memory pressure. 3350 */ 3351 if (!sk_under_global_memory_pressure(sk)) 3352 return 1; 3353 3354 /* Try to be fair among all the sockets under global 3355 * pressure by allowing the ones that below average 3356 * usage to raise. 
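		 *
		 * Concretely, with this socket's footprint rounded up to
		 * pages,
		 *
		 *	pages = sk_mem_pages(sk->sk_wmem_queued +
		 *			     atomic_read(&sk->sk_rmem_alloc) +
		 *			     sk->sk_forward_alloc)
		 *
		 * the allocation is allowed when
		 *
		 *	pages * sk_sockets_allocated_read_positive(sk)
		 *		< sk_prot_mem_limits(sk, 2)
		 *
		 * i.e. when every socket could use this much without the
		 * protocol exceeding its hard limit.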
3357 */ 3358 alloc = sk_sockets_allocated_read_positive(sk); 3359 if (sk_prot_mem_limits(sk, 2) > alloc * 3360 sk_mem_pages(sk->sk_wmem_queued + 3361 atomic_read(&sk->sk_rmem_alloc) + 3362 sk->sk_forward_alloc)) 3363 return 1; 3364 } 3365 3366 suppress_allocation: 3367 3368 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3369 sk_stream_moderate_sndbuf(sk); 3370 3371 /* Fail only if socket is _under_ its sndbuf. 3372 * In this case we cannot block, so that we have to fail. 3373 */ 3374 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3375 /* Force charge with __GFP_NOFAIL */ 3376 if (memcg_enabled && !charged) 3377 mem_cgroup_sk_charge(sk, amt, 3378 gfp_memcg_charge() | __GFP_NOFAIL); 3379 return 1; 3380 } 3381 } 3382 3383 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3384 3385 if (allocated) 3386 sk_memory_allocated_sub(sk, amt); 3387 3388 if (charged) 3389 mem_cgroup_sk_uncharge(sk, amt); 3390 3391 return 0; 3392 } 3393 3394 /** 3395 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3396 * @sk: socket 3397 * @size: memory size to allocate 3398 * @kind: allocation type 3399 * 3400 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3401 * rmem allocation. This function assumes that protocols which have 3402 * memory_pressure use sk_wmem_queued as write buffer accounting. 3403 */ 3404 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3405 { 3406 int ret, amt = sk_mem_pages(size); 3407 3408 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3409 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3410 if (!ret) 3411 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3412 return ret; 3413 } 3414 EXPORT_SYMBOL(__sk_mem_schedule); 3415 3416 /** 3417 * __sk_mem_reduce_allocated - reclaim memory_allocated 3418 * @sk: socket 3419 * @amount: number of quanta 3420 * 3421 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3422 */ 3423 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3424 { 3425 if (mem_cgroup_sk_enabled(sk)) 3426 mem_cgroup_sk_uncharge(sk, amount); 3427 3428 if (sk->sk_bypass_prot_mem) 3429 return; 3430 3431 sk_memory_allocated_sub(sk, amount); 3432 3433 if (sk_under_global_memory_pressure(sk) && 3434 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3435 sk_leave_memory_pressure(sk); 3436 } 3437 3438 /** 3439 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3440 * @sk: socket 3441 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3442 */ 3443 void __sk_mem_reclaim(struct sock *sk, int amount) 3444 { 3445 amount >>= PAGE_SHIFT; 3446 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3447 __sk_mem_reduce_allocated(sk, amount); 3448 } 3449 EXPORT_SYMBOL(__sk_mem_reclaim); 3450 3451 void __sk_charge(struct sock *sk, gfp_t gfp) 3452 { 3453 int amt; 3454 3455 gfp |= __GFP_NOFAIL; 3456 if (mem_cgroup_from_sk(sk)) { 3457 /* The socket has not been accepted yet, no need 3458 * to look at newsk->sk_wmem_queued. 3459 */ 3460 amt = sk_mem_pages(sk->sk_forward_alloc + 3461 atomic_read(&sk->sk_rmem_alloc)); 3462 if (amt) 3463 mem_cgroup_sk_charge(sk, amt, gfp); 3464 } 3465 3466 kmem_cache_charge(sk, gfp); 3467 } 3468 3469 int sk_set_peek_off(struct sock *sk, int val) 3470 { 3471 WRITE_ONCE(sk->sk_peek_off, val); 3472 return 0; 3473 } 3474 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3475 3476 /* 3477 * Set of default routines for initialising struct proto_ops when 3478 * the protocol does not support a particular function. 
In certain 3479 * cases where it makes no sense for a protocol to have a "do nothing" 3480 * function, some default processing is provided. 3481 */ 3482 3483 int sock_no_bind(struct socket *sock, struct sockaddr_unsized *saddr, int len) 3484 { 3485 return -EOPNOTSUPP; 3486 } 3487 EXPORT_SYMBOL(sock_no_bind); 3488 3489 int sock_no_connect(struct socket *sock, struct sockaddr_unsized *saddr, 3490 int len, int flags) 3491 { 3492 return -EOPNOTSUPP; 3493 } 3494 EXPORT_SYMBOL(sock_no_connect); 3495 3496 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3497 { 3498 return -EOPNOTSUPP; 3499 } 3500 EXPORT_SYMBOL(sock_no_socketpair); 3501 3502 int sock_no_accept(struct socket *sock, struct socket *newsock, 3503 struct proto_accept_arg *arg) 3504 { 3505 return -EOPNOTSUPP; 3506 } 3507 EXPORT_SYMBOL(sock_no_accept); 3508 3509 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3510 int peer) 3511 { 3512 return -EOPNOTSUPP; 3513 } 3514 EXPORT_SYMBOL(sock_no_getname); 3515 3516 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3517 { 3518 return -EOPNOTSUPP; 3519 } 3520 EXPORT_SYMBOL(sock_no_ioctl); 3521 3522 int sock_no_listen(struct socket *sock, int backlog) 3523 { 3524 return -EOPNOTSUPP; 3525 } 3526 EXPORT_SYMBOL(sock_no_listen); 3527 3528 int sock_no_shutdown(struct socket *sock, int how) 3529 { 3530 return -EOPNOTSUPP; 3531 } 3532 EXPORT_SYMBOL(sock_no_shutdown); 3533 3534 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3535 { 3536 return -EOPNOTSUPP; 3537 } 3538 EXPORT_SYMBOL(sock_no_sendmsg); 3539 3540 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3541 { 3542 return -EOPNOTSUPP; 3543 } 3544 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3545 3546 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3547 int flags) 3548 { 3549 return -EOPNOTSUPP; 3550 } 3551 EXPORT_SYMBOL(sock_no_recvmsg); 3552 3553 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3554 { 3555 /* Mirror missing mmap method error code */ 3556 return -ENODEV; 3557 } 3558 EXPORT_SYMBOL(sock_no_mmap); 3559 3560 /* 3561 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3562 * various sock-based usage counts. 
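 *
 * "Received" here means e.g. a descriptor passed over an AF_UNIX socket;
 * a rough user-space sketch of the sending side (hypothetical, not part
 * of the kernel, iovec setup omitted):
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))] = { 0 };
 *	struct msghdr msg = { .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd, sizeof(int));
 *	sendmsg(unix_fd, &msg, 0);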
3563 */ 3564 void __receive_sock(struct file *file) 3565 { 3566 struct socket *sock; 3567 3568 sock = sock_from_file(file); 3569 if (sock) { 3570 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3571 sock_update_classid(&sock->sk->sk_cgrp_data); 3572 } 3573 } 3574 3575 /* 3576 * Default Socket Callbacks 3577 */ 3578 3579 static void sock_def_wakeup(struct sock *sk) 3580 { 3581 struct socket_wq *wq; 3582 3583 rcu_read_lock(); 3584 wq = rcu_dereference(sk->sk_wq); 3585 if (skwq_has_sleeper(wq)) 3586 wake_up_interruptible_all(&wq->wait); 3587 rcu_read_unlock(); 3588 } 3589 3590 static void sock_def_error_report(struct sock *sk) 3591 { 3592 struct socket_wq *wq; 3593 3594 rcu_read_lock(); 3595 wq = rcu_dereference(sk->sk_wq); 3596 if (skwq_has_sleeper(wq)) 3597 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3598 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3599 rcu_read_unlock(); 3600 } 3601 3602 void sock_def_readable(struct sock *sk) 3603 { 3604 struct socket_wq *wq; 3605 3606 trace_sk_data_ready(sk); 3607 3608 rcu_read_lock(); 3609 wq = rcu_dereference(sk->sk_wq); 3610 if (skwq_has_sleeper(wq)) 3611 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3612 EPOLLRDNORM | EPOLLRDBAND); 3613 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3614 rcu_read_unlock(); 3615 } 3616 3617 static void sock_def_write_space(struct sock *sk) 3618 { 3619 struct socket_wq *wq; 3620 3621 rcu_read_lock(); 3622 3623 /* Do not wake up a writer until he can make "significant" 3624 * progress. --DaveM 3625 */ 3626 if (sock_writeable(sk)) { 3627 wq = rcu_dereference(sk->sk_wq); 3628 if (skwq_has_sleeper(wq)) 3629 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3630 EPOLLWRNORM | EPOLLWRBAND); 3631 3632 /* Should agree with poll, otherwise some programs break */ 3633 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3634 } 3635 3636 rcu_read_unlock(); 3637 } 3638 3639 /* An optimised version of sock_def_write_space(), should only be called 3640 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3641 * ->sk_wmem_alloc. 3642 */ 3643 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) 3644 { 3645 /* Do not wake up a writer until he can make "significant" 3646 * progress. 
--DaveM 3647 */ 3648 if (__sock_writeable(sk, wmem_alloc)) { 3649 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3650 3651 /* rely on refcount_sub from sock_wfree() */ 3652 smp_mb__after_atomic(); 3653 if (wq && waitqueue_active(&wq->wait)) 3654 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3655 EPOLLWRNORM | EPOLLWRBAND); 3656 3657 /* Should agree with poll, otherwise some programs break */ 3658 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3659 } 3660 } 3661 3662 static void sock_def_destruct(struct sock *sk) 3663 { 3664 } 3665 3666 void sk_send_sigurg(struct sock *sk) 3667 { 3668 if (sk->sk_socket && sk->sk_socket->file) 3669 if (send_sigurg(sk->sk_socket->file)) 3670 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3671 } 3672 EXPORT_SYMBOL(sk_send_sigurg); 3673 3674 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3675 unsigned long expires) 3676 { 3677 if (!mod_timer(timer, expires)) 3678 sock_hold(sk); 3679 } 3680 EXPORT_SYMBOL(sk_reset_timer); 3681 3682 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3683 { 3684 if (timer_delete(timer)) 3685 __sock_put(sk); 3686 } 3687 EXPORT_SYMBOL(sk_stop_timer); 3688 3689 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3690 { 3691 if (timer_delete_sync(timer)) 3692 __sock_put(sk); 3693 } 3694 EXPORT_SYMBOL(sk_stop_timer_sync); 3695 3696 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3697 { 3698 sk_init_common(sk); 3699 sk->sk_send_head = NULL; 3700 3701 timer_setup(&sk->sk_timer, NULL, 0); 3702 3703 sk->sk_allocation = GFP_KERNEL; 3704 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3705 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3706 sk->sk_state = TCP_CLOSE; 3707 sk->sk_use_task_frag = true; 3708 sk_set_socket(sk, sock); 3709 3710 sock_set_flag(sk, SOCK_ZAPPED); 3711 3712 if (sock) { 3713 sk->sk_type = sock->type; 3714 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3715 sock->sk = sk; 3716 } else { 3717 RCU_INIT_POINTER(sk->sk_wq, NULL); 3718 } 3719 sk->sk_uid = uid; 3720 3721 sk->sk_state_change = sock_def_wakeup; 3722 sk->sk_data_ready = sock_def_readable; 3723 sk->sk_write_space = sock_def_write_space; 3724 sk->sk_error_report = sock_def_error_report; 3725 sk->sk_destruct = sock_def_destruct; 3726 3727 sk->sk_frag.page = NULL; 3728 sk->sk_frag.offset = 0; 3729 sk->sk_peek_off = -1; 3730 3731 sk->sk_peer_pid = NULL; 3732 sk->sk_peer_cred = NULL; 3733 spin_lock_init(&sk->sk_peer_lock); 3734 3735 sk->sk_write_pending = 0; 3736 sk->sk_rcvlowat = 1; 3737 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3738 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3739 3740 sk->sk_stamp = SK_DEFAULT_STAMP; 3741 #if BITS_PER_LONG==32 3742 seqlock_init(&sk->sk_stamp_seq); 3743 #endif 3744 atomic_set(&sk->sk_zckey, 0); 3745 3746 #ifdef CONFIG_NET_RX_BUSY_POLL 3747 sk->sk_napi_id = 0; 3748 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3749 #endif 3750 3751 sk->sk_max_pacing_rate = ~0UL; 3752 sk->sk_pacing_rate = ~0UL; 3753 WRITE_ONCE(sk->sk_pacing_shift, 10); 3754 sk->sk_incoming_cpu = -1; 3755 3756 sk_rx_queue_clear(sk); 3757 /* 3758 * Before updating sk_refcnt, we must commit prior changes to memory 3759 * (Documentation/RCU/rculist_nulls.rst for details) 3760 */ 3761 smp_wmb(); 3762 refcount_set(&sk->sk_refcnt, 1); 3763 sk_drops_reset(sk); 3764 } 3765 EXPORT_SYMBOL(sock_init_data_uid); 3766 3767 void sock_init_data(struct socket *sock, struct sock *sk) 3768 { 3769 kuid_t uid = sock ? 
3770 SOCK_INODE(sock)->i_uid : 3771 make_kuid(sock_net(sk)->user_ns, 0); 3772 3773 sock_init_data_uid(sock, sk, uid); 3774 } 3775 EXPORT_SYMBOL(sock_init_data); 3776 3777 void lock_sock_nested(struct sock *sk, int subclass) 3778 { 3779 /* The sk_lock has mutex_lock() semantics here. */ 3780 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3781 3782 might_sleep(); 3783 spin_lock_bh(&sk->sk_lock.slock); 3784 if (sock_owned_by_user_nocheck(sk)) 3785 __lock_sock(sk); 3786 sk->sk_lock.owned = 1; 3787 spin_unlock_bh(&sk->sk_lock.slock); 3788 } 3789 EXPORT_SYMBOL(lock_sock_nested); 3790 3791 void release_sock(struct sock *sk) 3792 { 3793 spin_lock_bh(&sk->sk_lock.slock); 3794 if (sk->sk_backlog.tail) 3795 __release_sock(sk); 3796 3797 if (sk->sk_prot->release_cb) 3798 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3799 tcp_release_cb, sk); 3800 3801 sock_release_ownership(sk); 3802 if (waitqueue_active(&sk->sk_lock.wq)) 3803 wake_up(&sk->sk_lock.wq); 3804 spin_unlock_bh(&sk->sk_lock.slock); 3805 } 3806 EXPORT_SYMBOL(release_sock); 3807 3808 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3809 { 3810 might_sleep(); 3811 spin_lock_bh(&sk->sk_lock.slock); 3812 3813 if (!sock_owned_by_user_nocheck(sk)) { 3814 /* 3815 * Fast path return with bottom halves disabled and 3816 * sock::sk_lock.slock held. 3817 * 3818 * The 'mutex' is not contended and holding 3819 * sock::sk_lock.slock prevents all other lockers to 3820 * proceed so the corresponding unlock_sock_fast() can 3821 * avoid the slow path of release_sock() completely and 3822 * just release slock. 3823 * 3824 * From a semantical POV this is equivalent to 'acquiring' 3825 * the 'mutex', hence the corresponding lockdep 3826 * mutex_release() has to happen in the fast path of 3827 * unlock_sock_fast(). 
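		 *
		 * A typical pairing at the call site looks like (sketch,
		 * hypothetical caller, not from this file):
		 *
		 *	bool slow = lock_sock_fast(sk);
		 *	...
		 *	unlock_sock_fast(sk, slow);
		 *
		 * where 'slow' is simply the value returned here.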
3828 */ 3829 return false; 3830 } 3831 3832 __lock_sock(sk); 3833 sk->sk_lock.owned = 1; 3834 __acquire(&sk->sk_lock.slock); 3835 spin_unlock_bh(&sk->sk_lock.slock); 3836 return true; 3837 } 3838 EXPORT_SYMBOL(__lock_sock_fast); 3839 3840 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3841 bool timeval, bool time32) 3842 { 3843 struct sock *sk = sock->sk; 3844 struct timespec64 ts; 3845 3846 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3847 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3848 if (ts.tv_sec == -1) 3849 return -ENOENT; 3850 if (ts.tv_sec == 0) { 3851 ktime_t kt = ktime_get_real(); 3852 sock_write_timestamp(sk, kt); 3853 ts = ktime_to_timespec64(kt); 3854 } 3855 3856 if (timeval) 3857 ts.tv_nsec /= 1000; 3858 3859 #ifdef CONFIG_COMPAT_32BIT_TIME 3860 if (time32) 3861 return put_old_timespec32(&ts, userstamp); 3862 #endif 3863 #ifdef CONFIG_SPARC64 3864 /* beware of padding in sparc64 timeval */ 3865 if (timeval && !in_compat_syscall()) { 3866 struct __kernel_old_timeval __user tv = { 3867 .tv_sec = ts.tv_sec, 3868 .tv_usec = ts.tv_nsec, 3869 }; 3870 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3871 return -EFAULT; 3872 return 0; 3873 } 3874 #endif 3875 return put_timespec64(&ts, userstamp); 3876 } 3877 EXPORT_SYMBOL(sock_gettstamp); 3878 3879 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3880 { 3881 if (!sock_flag(sk, flag)) { 3882 unsigned long previous_flags = sk->sk_flags; 3883 3884 sock_set_flag(sk, flag); 3885 /* 3886 * we just set one of the two flags which require net 3887 * time stamping, but time stamping might have been on 3888 * already because of the other one 3889 */ 3890 if (sock_needs_netstamp(sk) && 3891 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3892 net_enable_timestamp(); 3893 } 3894 } 3895 3896 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3897 int level, int type) 3898 { 3899 struct sock_exterr_skb *serr; 3900 struct sk_buff *skb; 3901 int copied, err; 3902 3903 err = -EAGAIN; 3904 skb = sock_dequeue_err_skb(sk); 3905 if (skb == NULL) 3906 goto out; 3907 3908 copied = skb->len; 3909 if (copied > len) { 3910 msg->msg_flags |= MSG_TRUNC; 3911 copied = len; 3912 } 3913 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3914 if (err) 3915 goto out_free_skb; 3916 3917 sock_recv_timestamp(msg, sk, skb); 3918 3919 serr = SKB_EXT_ERR(skb); 3920 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3921 3922 msg->msg_flags |= MSG_ERRQUEUE; 3923 err = copied; 3924 3925 out_free_skb: 3926 kfree_skb(skb); 3927 out: 3928 return err; 3929 } 3930 EXPORT_SYMBOL(sock_recv_errqueue); 3931 3932 /* 3933 * Get a socket option on an socket. 3934 * 3935 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3936 * asynchronous errors should be reported by getsockopt. We assume 3937 * this means if you specify SO_ERROR (otherwise what is the point of it). 3938 */ 3939 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3940 char __user *optval, int __user *optlen) 3941 { 3942 struct sock *sk = sock->sk; 3943 3944 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
*/ 3945 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3946 } 3947 EXPORT_SYMBOL(sock_common_getsockopt); 3948 3949 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3950 int flags) 3951 { 3952 struct sock *sk = sock->sk; 3953 int addr_len = 0; 3954 int err; 3955 3956 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3957 if (err >= 0) 3958 msg->msg_namelen = addr_len; 3959 return err; 3960 } 3961 EXPORT_SYMBOL(sock_common_recvmsg); 3962 3963 /* 3964 * Set socket options on an inet socket. 3965 */ 3966 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3967 sockptr_t optval, unsigned int optlen) 3968 { 3969 struct sock *sk = sock->sk; 3970 3971 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3972 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3973 } 3974 EXPORT_SYMBOL(sock_common_setsockopt); 3975 3976 void sk_common_release(struct sock *sk) 3977 { 3978 if (sk->sk_prot->destroy) 3979 sk->sk_prot->destroy(sk); 3980 3981 /* 3982 * Observation: when sk_common_release is called, processes have 3983 * no access to socket. But net still has. 3984 * Step one, detach it from networking: 3985 * 3986 * A. Remove from hash tables. 3987 */ 3988 3989 sk->sk_prot->unhash(sk); 3990 3991 /* 3992 * In this point socket cannot receive new packets, but it is possible 3993 * that some packets are in flight because some CPU runs receiver and 3994 * did hash table lookup before we unhashed socket. They will achieve 3995 * receive queue and will be purged by socket destructor. 3996 * 3997 * Also we still have packets pending on receive queue and probably, 3998 * our own packets waiting in device queues. sock_destroy will drain 3999 * receive queue, but transmitted packets will delay socket destruction 4000 * until the last reference will be released. 4001 */ 4002 4003 sock_orphan(sk); 4004 4005 xfrm_sk_free_policy(sk); 4006 4007 sock_put(sk); 4008 } 4009 EXPORT_SYMBOL(sk_common_release); 4010 4011 void sk_get_meminfo(const struct sock *sk, u32 *mem) 4012 { 4013 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 4014 4015 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 4016 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 4017 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 4018 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 4019 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 4020 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 4021 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 4022 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 4023 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); 4024 } 4025 4026 #ifdef CONFIG_PROC_FS 4027 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 4028 4029 int sock_prot_inuse_get(struct net *net, struct proto *prot) 4030 { 4031 int cpu, idx = prot->inuse_idx; 4032 int res = 0; 4033 4034 for_each_possible_cpu(cpu) 4035 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 4036 4037 return res >= 0 ? 
res : 0; 4038 } 4039 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4040 4041 int sock_inuse_get(struct net *net) 4042 { 4043 int cpu, res = 0; 4044 4045 for_each_possible_cpu(cpu) 4046 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4047 4048 return res; 4049 } 4050 4051 EXPORT_SYMBOL_GPL(sock_inuse_get); 4052 4053 static int __net_init sock_inuse_init_net(struct net *net) 4054 { 4055 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4056 if (net->core.prot_inuse == NULL) 4057 return -ENOMEM; 4058 return 0; 4059 } 4060 4061 static void __net_exit sock_inuse_exit_net(struct net *net) 4062 { 4063 free_percpu(net->core.prot_inuse); 4064 } 4065 4066 static struct pernet_operations net_inuse_ops = { 4067 .init = sock_inuse_init_net, 4068 .exit = sock_inuse_exit_net, 4069 }; 4070 4071 static __init int net_inuse_init(void) 4072 { 4073 if (register_pernet_subsys(&net_inuse_ops)) 4074 panic("Cannot initialize net inuse counters"); 4075 4076 return 0; 4077 } 4078 4079 core_initcall(net_inuse_init); 4080 4081 static int assign_proto_idx(struct proto *prot) 4082 { 4083 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4084 4085 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4086 pr_err("PROTO_INUSE_NR exhausted\n"); 4087 return -ENOSPC; 4088 } 4089 4090 set_bit(prot->inuse_idx, proto_inuse_idx); 4091 return 0; 4092 } 4093 4094 static void release_proto_idx(struct proto *prot) 4095 { 4096 if (prot->inuse_idx != PROTO_INUSE_NR) 4097 clear_bit(prot->inuse_idx, proto_inuse_idx); 4098 } 4099 #else 4100 static inline int assign_proto_idx(struct proto *prot) 4101 { 4102 return 0; 4103 } 4104 4105 static inline void release_proto_idx(struct proto *prot) 4106 { 4107 } 4108 4109 #endif 4110 4111 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4112 { 4113 if (!twsk_prot) 4114 return; 4115 kfree(twsk_prot->twsk_slab_name); 4116 twsk_prot->twsk_slab_name = NULL; 4117 kmem_cache_destroy(twsk_prot->twsk_slab); 4118 twsk_prot->twsk_slab = NULL; 4119 } 4120 4121 static int tw_prot_init(const struct proto *prot) 4122 { 4123 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4124 4125 if (!twsk_prot) 4126 return 0; 4127 4128 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4129 prot->name); 4130 if (!twsk_prot->twsk_slab_name) 4131 return -ENOMEM; 4132 4133 twsk_prot->twsk_slab = 4134 kmem_cache_create(twsk_prot->twsk_slab_name, 4135 twsk_prot->twsk_obj_size, 0, 4136 SLAB_ACCOUNT | prot->slab_flags, 4137 NULL); 4138 if (!twsk_prot->twsk_slab) { 4139 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4140 prot->name); 4141 return -ENOMEM; 4142 } 4143 4144 return 0; 4145 } 4146 4147 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4148 { 4149 if (!rsk_prot) 4150 return; 4151 kfree(rsk_prot->slab_name); 4152 rsk_prot->slab_name = NULL; 4153 kmem_cache_destroy(rsk_prot->slab); 4154 rsk_prot->slab = NULL; 4155 } 4156 4157 static int req_prot_init(const struct proto *prot) 4158 { 4159 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4160 4161 if (!rsk_prot) 4162 return 0; 4163 4164 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4165 prot->name); 4166 if (!rsk_prot->slab_name) 4167 return -ENOMEM; 4168 4169 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4170 rsk_prot->obj_size, 0, 4171 SLAB_ACCOUNT | prot->slab_flags, 4172 NULL); 4173 4174 if (!rsk_prot->slab) { 4175 pr_crit("%s: Can't create request sock SLAB cache!\n", 4176 prot->name); 4177 return -ENOMEM; 4178 } 4179 return 0; 4180 } 4181 4182 int 
static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
{
	if (!twsk_prot)
		return;
	kfree(twsk_prot->twsk_slab_name);
	twsk_prot->twsk_slab_name = NULL;
	kmem_cache_destroy(twsk_prot->twsk_slab);
	twsk_prot->twsk_slab = NULL;
}

static int tw_prot_init(const struct proto *prot)
{
	struct timewait_sock_ops *twsk_prot = prot->twsk_prot;

	if (!twsk_prot)
		return 0;

	twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
					      prot->name);
	if (!twsk_prot->twsk_slab_name)
		return -ENOMEM;

	twsk_prot->twsk_slab =
		kmem_cache_create(twsk_prot->twsk_slab_name,
				  twsk_prot->twsk_obj_size, 0,
				  SLAB_ACCOUNT | prot->slab_flags,
				  NULL);
	if (!twsk_prot->twsk_slab) {
		pr_crit("%s: Can't create timewait sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}

	return 0;
}

static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   SLAB_ACCOUNT | prot->slab_flags,
					   NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	int ret = -ENOBUFS;

	if (prot->memory_allocated && !prot->sysctl_mem) {
		pr_err("%s: missing sysctl_mem\n", prot->name);
		return -EINVAL;
	}
	if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
		pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
		return -EINVAL;
	}
	if (alloc_slab) {
		prot->slab = kmem_cache_create_usercopy(prot->name,
					prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
					prot->slab_flags,
					prot->useroffset, prot->usersize,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (tw_prot_init(prot))
			goto out_free_timewait_sock_slab;
	}

	mutex_lock(&proto_list_mutex);
	ret = assign_proto_idx(prot);
	if (ret) {
		mutex_unlock(&proto_list_mutex);
		goto out_free_timewait_sock_slab;
	}
	list_add(&prot->node, &proto_list);
	mutex_unlock(&proto_list_mutex);
	return ret;

out_free_timewait_sock_slab:
	if (alloc_slab)
		tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
	if (alloc_slab) {
		req_prot_cleanup(prot->rsk_prot);

		kmem_cache_destroy(prot->slab);
		prot->slab = NULL;
	}
out:
	return ret;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);
	tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
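/* A protocol module typically calls proto_register() from its init path and
 * proto_unregister() on exit. An illustrative sketch only; "example_prot" and
 * "struct example_sock" are hypothetical, and the second argument requests a
 * dedicated kmem cache for the sockets:
 *
 *	static struct proto example_prot = {
 *		.name	  = "EXAMPLE",
 *		.owner	  = THIS_MODULE,
 *		.obj_size = sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		return proto_register(&example_prot, 1);
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		proto_unregister(&example_prot);
 *	}
 */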
"yes" : "no" : "NI"; 4310 } 4311 4312 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4313 { 4314 4315 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4316 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4317 proto->name, 4318 proto->obj_size, 4319 sock_prot_inuse_get(seq_file_net(seq), proto), 4320 sock_prot_memory_allocated(proto), 4321 sock_prot_memory_pressure(proto), 4322 proto->max_header, 4323 proto->slab == NULL ? "no" : "yes", 4324 module_name(proto->owner), 4325 proto_method_implemented(proto->close), 4326 proto_method_implemented(proto->connect), 4327 proto_method_implemented(proto->disconnect), 4328 proto_method_implemented(proto->accept), 4329 proto_method_implemented(proto->ioctl), 4330 proto_method_implemented(proto->init), 4331 proto_method_implemented(proto->destroy), 4332 proto_method_implemented(proto->shutdown), 4333 proto_method_implemented(proto->setsockopt), 4334 proto_method_implemented(proto->getsockopt), 4335 proto_method_implemented(proto->sendmsg), 4336 proto_method_implemented(proto->recvmsg), 4337 proto_method_implemented(proto->bind), 4338 proto_method_implemented(proto->backlog_rcv), 4339 proto_method_implemented(proto->hash), 4340 proto_method_implemented(proto->unhash), 4341 proto_method_implemented(proto->get_port), 4342 proto_method_implemented(proto->enter_memory_pressure)); 4343 } 4344 4345 static int proto_seq_show(struct seq_file *seq, void *v) 4346 { 4347 if (v == &proto_list) 4348 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4349 "protocol", 4350 "size", 4351 "sockets", 4352 "memory", 4353 "press", 4354 "maxhdr", 4355 "slab", 4356 "module", 4357 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4358 else 4359 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4360 return 0; 4361 } 4362 4363 static const struct seq_operations proto_seq_ops = { 4364 .start = proto_seq_start, 4365 .next = proto_seq_next, 4366 .stop = proto_seq_stop, 4367 .show = proto_seq_show, 4368 }; 4369 4370 static __net_init int proto_init_net(struct net *net) 4371 { 4372 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4373 sizeof(struct seq_net_private))) 4374 return -ENOMEM; 4375 4376 return 0; 4377 } 4378 4379 static __net_exit void proto_exit_net(struct net *net) 4380 { 4381 remove_proc_entry("protocols", net->proc_net); 4382 } 4383 4384 4385 static __net_initdata struct pernet_operations proto_net_ops = { 4386 .init = proto_init_net, 4387 .exit = proto_exit_net, 4388 }; 4389 4390 static int __init proto_init(void) 4391 { 4392 return register_pernet_subsys(&proto_net_ops); 4393 } 4394 4395 subsys_initcall(proto_init); 4396 4397 #endif /* PROC_FS */ 4398 4399 #ifdef CONFIG_NET_RX_BUSY_POLL 4400 bool sk_busy_loop_end(void *p, unsigned long start_time) 4401 { 4402 struct sock *sk = p; 4403 4404 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4405 return true; 4406 4407 if (sk_is_udp(sk) && 4408 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4409 return true; 4410 4411 return sk_busy_loop_timeout(sk, start_time); 4412 } 4413 EXPORT_SYMBOL(sk_busy_loop_end); 4414 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4415 4416 int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len) 4417 { 4418 if (!sk->sk_prot->bind_add) 4419 return -EOPNOTSUPP; 4420 return sk->sk_prot->bind_add(sk, addr, addr_len); 4421 } 4422 EXPORT_SYMBOL(sock_bind_add); 4423 4424 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4425 int 
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(proto_list_mutex)
{
	mutex_lock(&proto_list_mutex);
	return seq_list_start_head(&proto_list, *pos);
}

static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	return seq_list_next(v, &proto_list, pos);
}

static void proto_seq_stop(struct seq_file *seq, void *v)
	__releases(proto_list_mutex)
{
	mutex_unlock(&proto_list_mutex);
}

static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}

static long sock_prot_memory_allocated(struct proto *proto)
{
	return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
}

static const char *sock_prot_memory_pressure(struct proto *proto)
{
	return proto->memory_pressure != NULL ?
		proto_memory_pressure(proto) ? "yes" : "no" : "NI";
}

static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
	seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
			"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
		   proto->name,
		   proto->obj_size,
		   sock_prot_inuse_get(seq_file_net(seq), proto),
		   sock_prot_memory_allocated(proto),
		   sock_prot_memory_pressure(proto),
		   proto->max_header,
		   proto->slab == NULL ? "no" : "yes",
		   module_name(proto->owner),
		   proto_method_implemented(proto->close),
		   proto_method_implemented(proto->connect),
		   proto_method_implemented(proto->disconnect),
		   proto_method_implemented(proto->accept),
		   proto_method_implemented(proto->ioctl),
		   proto_method_implemented(proto->init),
		   proto_method_implemented(proto->destroy),
		   proto_method_implemented(proto->shutdown),
		   proto_method_implemented(proto->setsockopt),
		   proto_method_implemented(proto->getsockopt),
		   proto_method_implemented(proto->sendmsg),
		   proto_method_implemented(proto->recvmsg),
		   proto_method_implemented(proto->bind),
		   proto_method_implemented(proto->backlog_rcv),
		   proto_method_implemented(proto->hash),
		   proto_method_implemented(proto->unhash),
		   proto_method_implemented(proto->get_port),
		   proto_method_implemented(proto->enter_memory_pressure));
}

static int proto_seq_show(struct seq_file *seq, void *v)
{
	if (v == &proto_list)
		seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
			   "protocol",
			   "size",
			   "sockets",
			   "memory",
			   "press",
			   "maxhdr",
			   "slab",
			   "module",
			   "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
	else
		proto_seq_printf(seq, list_entry(v, struct proto, node));
	return 0;
}

static const struct seq_operations proto_seq_ops = {
	.start = proto_seq_start,
	.next  = proto_seq_next,
	.stop  = proto_seq_stop,
	.show  = proto_seq_show,
};

static __net_init int proto_init_net(struct net *net)
{
	if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
			     sizeof(struct seq_net_private)))
		return -ENOMEM;

	return 0;
}

static __net_exit void proto_exit_net(struct net *net)
{
	remove_proc_entry("protocols", net->proc_net);
}

static __net_initdata struct pernet_operations proto_net_ops = {
	.init = proto_init_net,
	.exit = proto_exit_net,
};

static int __init proto_init(void)
{
	return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);

#endif /* PROC_FS */

#ifdef CONFIG_NET_RX_BUSY_POLL
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */

int sock_bind_add(struct sock *sk, struct sockaddr_unsized *addr, int addr_len)
{
	if (!sk->sk_prot->bind_add)
		return -EOPNOTSUPP;
	return sk->sk_prot->bind_add(sk, addr, addr_len);
}
EXPORT_SYMBOL(sock_bind_add);

/* Copy 'size' bytes from userspace into 'karg', run the protocol ioctl and,
 * on success, copy 'size' bytes of the result back to userspace.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function, where the result (4 bytes) is
 * copied back to userspace if the ioctl() returns successfully. No input is
 * copied from userspace.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around sock ioctls, which copies the data from userspace
 * (depending on the protocol/ioctl) and copies the result back to userspace.
 * The main motivation for this function is to pass kernel memory to the
 * protocol ioctl callbacks, instead of userspace memory.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If the ioctl was processed, return its value */
	if (rc <= 0)
		return rc;

	/* Otherwise call the default handler */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
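/* With this scheme, protocol ->ioctl() handlers work purely on kernel memory;
 * the user copies happen in sock_ioctl_inout()/sock_ioctl_out() above. A
 * hedged sketch of such a handler, assuming the kernel-memory ->ioctl()
 * prototype; example_ioctl() and the way the answer is computed are
 * hypothetical:
 *
 *	static int example_ioctl(struct sock *sk, int cmd, int *karg)
 *	{
 *		switch (cmd) {
 *		case SIOCINQ:
 *			*karg = sk_rmem_alloc_get(sk);	// rough queued-bytes answer
 *			return 0;
 *		default:
 *			return -ENOIOCTLCMD;
 *		}
 *	}
 */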
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
#ifdef CONFIG_MEMCG
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);
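/* The groups asserted above are declared inside struct sock with the
 * cacheline group markers (__cacheline_group_begin()/__cacheline_group_end());
 * the asserts merely pin each member to its intended group so that unrelated
 * changes do not silently regress cacheline placement. A hedged, stand-alone
 * sketch with a hypothetical struct:
 *
 *	struct example_stats {
 *		__cacheline_group_begin(hot);
 *		u64	rx_packets;
 *		u64	rx_bytes;
 *		__cacheline_group_end(hot);
 *		u64	rarely_touched;
 *	};
 *
 *	static int __init example_check(void)
 *	{
 *		CACHELINE_ASSERT_GROUP_MEMBER(struct example_stats, hot, rx_bytes);
 *		return 0;
 *	}
 */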