1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * INET An implementation of the TCP/IP protocol suite for the LINUX 4 * operating system. INET is implemented using the BSD Socket 5 * interface as the means of communication with the user level. 6 * 7 * Generic socket support routines. Memory allocators, socket lock/release 8 * handler for protocols to use and generic option handler. 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Florian La Roche, <flla@stud.uni-sb.de> 13 * Alan Cox, <A.Cox@swansea.ac.uk> 14 * 15 * Fixes: 16 * Alan Cox : Numerous verify_area() problems 17 * Alan Cox : Connecting on a connecting socket 18 * now returns an error for tcp. 19 * Alan Cox : sock->protocol is set correctly. 20 * and is not sometimes left as 0. 21 * Alan Cox : connect handles icmp errors on a 22 * connect properly. Unfortunately there 23 * is a restart syscall nasty there. I 24 * can't match BSD without hacking the C 25 * library. Ideas urgently sought! 26 * Alan Cox : Disallow bind() to addresses that are 27 * not ours - especially broadcast ones!! 28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) 29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, 30 * instead they leave that for the DESTROY timer. 31 * Alan Cox : Clean up error flag in accept 32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer 33 * was buggy. Put a remove_sock() in the handler 34 * for memory when we hit 0. Also altered the timer 35 * code. The ACK stuff can wait and needs major 36 * TCP layer surgery. 37 * Alan Cox : Fixed TCP ack bug, removed remove sock 38 * and fixed timer/inet_bh race. 39 * Alan Cox : Added zapped flag for TCP 40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code 41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb 42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources 43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. 44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... 45 * Rick Sladkey : Relaxed UDP rules for matching packets. 46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support 47 * Pauline Middelink : identd support 48 * Alan Cox : Fixed connect() taking signals I think. 49 * Alan Cox : SO_LINGER supported 50 * Alan Cox : Error reporting fixes 51 * Anonymous : inet_create tidied up (sk->reuse setting) 52 * Alan Cox : inet sockets don't set sk->type! 53 * Alan Cox : Split socket option code 54 * Alan Cox : Callbacks 55 * Alan Cox : Nagle flag for Charles & Johannes stuff 56 * Alex : Removed restriction on inet fioctl 57 * Alan Cox : Splitting INET from NET core 58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() 59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code 60 * Alan Cox : Split IP from generic code 61 * Alan Cox : New kfree_skbmem() 62 * Alan Cox : Make SO_DEBUG superuser only. 63 * Alan Cox : Allow anyone to clear SO_DEBUG 64 * (compatibility fix) 65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. 66 * Alan Cox : Allocator for a socket is settable. 67 * Alan Cox : SO_ERROR includes soft errors. 68 * Alan Cox : Allow NULL arguments on some SO_ opts 69 * Alan Cox : Generic socket allocation to make hooks 70 * easier (suggested by Craig Metz). 71 * Michael Pall : SO_ERROR returns positive errno again 72 * Steve Whitehouse: Added default destructor to free 73 * protocol private data. 
74 * Steve Whitehouse: Added various other default routines 75 * common to several socket families. 76 * Chris Evans : Call suser() check last on F_SETOWN 77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. 78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() 79 * Andi Kleen : Fix write_space callback 80 * Chris Evans : Security fixes - signedness again 81 * Arnaldo C. Melo : cleanups, use skb_queue_purge 82 * 83 * To Fix: 84 */ 85 86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 87 88 #include <linux/unaligned.h> 89 #include <linux/capability.h> 90 #include <linux/errno.h> 91 #include <linux/errqueue.h> 92 #include <linux/types.h> 93 #include <linux/socket.h> 94 #include <linux/in.h> 95 #include <linux/kernel.h> 96 #include <linux/module.h> 97 #include <linux/proc_fs.h> 98 #include <linux/seq_file.h> 99 #include <linux/sched.h> 100 #include <linux/sched/mm.h> 101 #include <linux/timer.h> 102 #include <linux/string.h> 103 #include <linux/sockios.h> 104 #include <linux/net.h> 105 #include <linux/mm.h> 106 #include <linux/slab.h> 107 #include <linux/interrupt.h> 108 #include <linux/poll.h> 109 #include <linux/tcp.h> 110 #include <linux/udp.h> 111 #include <linux/init.h> 112 #include <linux/highmem.h> 113 #include <linux/user_namespace.h> 114 #include <linux/static_key.h> 115 #include <linux/memcontrol.h> 116 #include <linux/prefetch.h> 117 #include <linux/compat.h> 118 #include <linux/mroute.h> 119 #include <linux/mroute6.h> 120 #include <linux/icmpv6.h> 121 122 #include <linux/uaccess.h> 123 124 #include <linux/netdevice.h> 125 #include <net/protocol.h> 126 #include <linux/skbuff.h> 127 #include <linux/skbuff_ref.h> 128 #include <net/net_namespace.h> 129 #include <net/request_sock.h> 130 #include <net/sock.h> 131 #include <net/proto_memory.h> 132 #include <linux/net_tstamp.h> 133 #include <net/xfrm.h> 134 #include <linux/ipsec.h> 135 #include <net/cls_cgroup.h> 136 #include <net/netprio_cgroup.h> 137 #include <linux/sock_diag.h> 138 139 #include <linux/filter.h> 140 #include <net/sock_reuseport.h> 141 #include <net/bpf_sk_storage.h> 142 143 #include <trace/events/sock.h> 144 145 #include <net/tcp.h> 146 #include <net/busy_poll.h> 147 #include <net/phonet/phonet.h> 148 149 #include <linux/ethtool.h> 150 151 #include <uapi/linux/pidfd.h> 152 153 #include "dev.h" 154 155 static DEFINE_MUTEX(proto_list_mutex); 156 static LIST_HEAD(proto_list); 157 158 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc); 159 static void sock_def_write_space(struct sock *sk); 160 161 /** 162 * sk_ns_capable - General socket capability test 163 * @sk: Socket to use a capability on or through 164 * @user_ns: The user namespace of the capability to use 165 * @cap: The capability to use 166 * 167 * Test to see if the opener of the socket had when the socket was 168 * created and the current process has the capability @cap in the user 169 * namespace @user_ns. 170 */ 171 bool sk_ns_capable(const struct sock *sk, 172 struct user_namespace *user_ns, int cap) 173 { 174 return file_ns_capable(sk->sk_socket->file, user_ns, cap) && 175 ns_capable(user_ns, cap); 176 } 177 EXPORT_SYMBOL(sk_ns_capable); 178 179 /** 180 * sk_capable - Socket global capability test 181 * @sk: Socket to use a capability on or through 182 * @cap: The global capability to use 183 * 184 * Test to see if the opener of the socket had when the socket was 185 * created and the current process has the capability @cap in all user 186 * namespaces. 
187 */ 188 bool sk_capable(const struct sock *sk, int cap) 189 { 190 return sk_ns_capable(sk, &init_user_ns, cap); 191 } 192 EXPORT_SYMBOL(sk_capable); 193 194 /** 195 * sk_net_capable - Network namespace socket capability test 196 * @sk: Socket to use a capability on or through 197 * @cap: The capability to use 198 * 199 * Test to see if the opener of the socket had when the socket was created 200 * and the current process has the capability @cap over the network namespace 201 * the socket is a member of. 202 */ 203 bool sk_net_capable(const struct sock *sk, int cap) 204 { 205 return sk_ns_capable(sk, sock_net(sk)->user_ns, cap); 206 } 207 EXPORT_SYMBOL(sk_net_capable); 208 209 /* 210 * Each address family might have different locking rules, so we have 211 * one slock key per address family and separate keys for internal and 212 * userspace sockets. 213 */ 214 static struct lock_class_key af_family_keys[AF_MAX]; 215 static struct lock_class_key af_family_kern_keys[AF_MAX]; 216 static struct lock_class_key af_family_slock_keys[AF_MAX]; 217 static struct lock_class_key af_family_kern_slock_keys[AF_MAX]; 218 219 /* 220 * Make lock validator output more readable. (we pre-construct these 221 * strings build-time, so that runtime initialization of socket 222 * locks is fast): 223 */ 224 225 #define _sock_locks(x) \ 226 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \ 227 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \ 228 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \ 229 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \ 230 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \ 231 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \ 232 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \ 233 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \ 234 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \ 235 x "27" , x "28" , x "AF_CAN" , \ 236 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \ 237 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \ 238 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \ 239 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \ 240 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \ 241 x "AF_MCTP" , \ 242 x "AF_MAX" 243 244 static const char *const af_family_key_strings[AF_MAX+1] = { 245 _sock_locks("sk_lock-") 246 }; 247 static const char *const af_family_slock_key_strings[AF_MAX+1] = { 248 _sock_locks("slock-") 249 }; 250 static const char *const af_family_clock_key_strings[AF_MAX+1] = { 251 _sock_locks("clock-") 252 }; 253 254 static const char *const af_family_kern_key_strings[AF_MAX+1] = { 255 _sock_locks("k-sk_lock-") 256 }; 257 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { 258 _sock_locks("k-slock-") 259 }; 260 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { 261 _sock_locks("k-clock-") 262 }; 263 static const char *const af_family_rlock_key_strings[AF_MAX+1] = { 264 _sock_locks("rlock-") 265 }; 266 static const char *const af_family_wlock_key_strings[AF_MAX+1] = { 267 _sock_locks("wlock-") 268 }; 269 static const char *const af_family_elock_key_strings[AF_MAX+1] = { 270 _sock_locks("elock-") 271 }; 272 273 /* 274 * sk_callback_lock and sk queues locking rules are per-address-family, 275 * so split the lock classes by using a per-AF key: 276 */ 277 static struct lock_class_key af_callback_keys[AF_MAX]; 278 static struct lock_class_key af_rlock_keys[AF_MAX]; 279 static struct lock_class_key af_wlock_keys[AF_MAX]; 280 static struct lock_class_key af_elock_keys[AF_MAX]; 281 static struct lock_class_key af_kern_callback_keys[AF_MAX]; 282 283 /* Run time adjustable parameters. 
*/ 284 __u32 sysctl_wmem_max __read_mostly = 4 << 20; 285 EXPORT_SYMBOL(sysctl_wmem_max); 286 __u32 sysctl_rmem_max __read_mostly = 4 << 20; 287 EXPORT_SYMBOL(sysctl_rmem_max); 288 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_DEFAULT; 289 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_DEFAULT; 290 291 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key); 292 EXPORT_SYMBOL_GPL(memalloc_socks_key); 293 294 /** 295 * sk_set_memalloc - sets %SOCK_MEMALLOC 296 * @sk: socket to set it on 297 * 298 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 299 * It's the responsibility of the admin to adjust min_free_kbytes 300 * to meet the requirements 301 */ 302 void sk_set_memalloc(struct sock *sk) 303 { 304 sock_set_flag(sk, SOCK_MEMALLOC); 305 sk->sk_allocation |= __GFP_MEMALLOC; 306 static_branch_inc(&memalloc_socks_key); 307 } 308 EXPORT_SYMBOL_GPL(sk_set_memalloc); 309 310 void sk_clear_memalloc(struct sock *sk) 311 { 312 sock_reset_flag(sk, SOCK_MEMALLOC); 313 sk->sk_allocation &= ~__GFP_MEMALLOC; 314 static_branch_dec(&memalloc_socks_key); 315 316 /* 317 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 318 * progress of swapping. SOCK_MEMALLOC may be cleared while 319 * it has rmem allocations due to the last swapfile being deactivated 320 * but there is a risk that the socket is unusable due to exceeding 321 * the rmem limits. Reclaim the reserves and obey rmem limits again. 322 */ 323 sk_mem_reclaim(sk); 324 } 325 EXPORT_SYMBOL_GPL(sk_clear_memalloc); 326 327 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) 328 { 329 int ret; 330 unsigned int noreclaim_flag; 331 332 /* these should have been dropped before queueing */ 333 BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); 334 335 noreclaim_flag = memalloc_noreclaim_save(); 336 ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv, 337 tcp_v6_do_rcv, 338 tcp_v4_do_rcv, 339 sk, skb); 340 memalloc_noreclaim_restore(noreclaim_flag); 341 342 return ret; 343 } 344 EXPORT_SYMBOL(__sk_backlog_rcv); 345 346 void sk_error_report(struct sock *sk) 347 { 348 sk->sk_error_report(sk); 349 350 switch (sk->sk_family) { 351 case AF_INET: 352 fallthrough; 353 case AF_INET6: 354 trace_inet_sk_error_report(sk); 355 break; 356 default: 357 break; 358 } 359 } 360 EXPORT_SYMBOL(sk_error_report); 361 362 int sock_get_timeout(long timeo, void *optval, bool old_timeval) 363 { 364 struct __kernel_sock_timeval tv; 365 366 if (timeo == MAX_SCHEDULE_TIMEOUT) { 367 tv.tv_sec = 0; 368 tv.tv_usec = 0; 369 } else { 370 tv.tv_sec = timeo / HZ; 371 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ; 372 } 373 374 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 375 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec }; 376 *(struct old_timeval32 *)optval = tv32; 377 return sizeof(tv32); 378 } 379 380 if (old_timeval) { 381 struct __kernel_old_timeval old_tv; 382 old_tv.tv_sec = tv.tv_sec; 383 old_tv.tv_usec = tv.tv_usec; 384 *(struct __kernel_old_timeval *)optval = old_tv; 385 return sizeof(old_tv); 386 } 387 388 *(struct __kernel_sock_timeval *)optval = tv; 389 return sizeof(tv); 390 } 391 EXPORT_SYMBOL(sock_get_timeout); 392 393 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv, 394 sockptr_t optval, int optlen, bool old_timeval) 395 { 396 if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) { 397 struct old_timeval32 tv32; 398 399 if (optlen < sizeof(tv32)) 400 return -EINVAL; 401 402 if (copy_from_sockptr(&tv32, optval, sizeof(tv32))) 403 return -EFAULT; 404 tv->tv_sec = tv32.tv_sec; 405 tv->tv_usec = tv32.tv_usec; 
406 } else if (old_timeval) { 407 struct __kernel_old_timeval old_tv; 408 409 if (optlen < sizeof(old_tv)) 410 return -EINVAL; 411 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv))) 412 return -EFAULT; 413 tv->tv_sec = old_tv.tv_sec; 414 tv->tv_usec = old_tv.tv_usec; 415 } else { 416 if (optlen < sizeof(*tv)) 417 return -EINVAL; 418 if (copy_from_sockptr(tv, optval, sizeof(*tv))) 419 return -EFAULT; 420 } 421 422 return 0; 423 } 424 EXPORT_SYMBOL(sock_copy_user_timeval); 425 426 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen, 427 bool old_timeval) 428 { 429 struct __kernel_sock_timeval tv; 430 int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval); 431 long val; 432 433 if (err) 434 return err; 435 436 if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC) 437 return -EDOM; 438 439 if (tv.tv_sec < 0) { 440 static int warned __read_mostly; 441 442 WRITE_ONCE(*timeo_p, 0); 443 if (warned < 10 && net_ratelimit()) { 444 warned++; 445 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n", 446 __func__, current->comm, task_pid_nr(current)); 447 } 448 return 0; 449 } 450 val = MAX_SCHEDULE_TIMEOUT; 451 if ((tv.tv_sec || tv.tv_usec) && 452 (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))) 453 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, 454 USEC_PER_SEC / HZ); 455 WRITE_ONCE(*timeo_p, val); 456 return 0; 457 } 458 459 static bool sk_set_prio_allowed(const struct sock *sk, int val) 460 { 461 return ((val >= TC_PRIO_BESTEFFORT && val <= TC_PRIO_INTERACTIVE) || 462 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || 463 sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)); 464 } 465 466 static bool sock_needs_netstamp(const struct sock *sk) 467 { 468 switch (sk->sk_family) { 469 case AF_UNSPEC: 470 case AF_UNIX: 471 return false; 472 default: 473 return true; 474 } 475 } 476 477 static void sock_disable_timestamp(struct sock *sk, unsigned long flags) 478 { 479 if (sk->sk_flags & flags) { 480 sk->sk_flags &= ~flags; 481 if (sock_needs_netstamp(sk) && 482 !(sk->sk_flags & SK_FLAGS_TIMESTAMP)) 483 net_disable_timestamp(); 484 } 485 } 486 487 488 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) 489 { 490 unsigned long flags; 491 struct sk_buff_head *list = &sk->sk_receive_queue; 492 493 if (atomic_read(&sk->sk_rmem_alloc) >= READ_ONCE(sk->sk_rcvbuf)) { 494 sk_drops_inc(sk); 495 trace_sock_rcvqueue_full(sk, skb); 496 return -ENOMEM; 497 } 498 499 if (!sk_rmem_schedule(sk, skb, skb->truesize)) { 500 sk_drops_inc(sk); 501 return -ENOBUFS; 502 } 503 504 skb->dev = NULL; 505 skb_set_owner_r(skb, sk); 506 507 /* we escape from rcu protected region, make sure we dont leak 508 * a norefcounted dst 509 */ 510 skb_dst_force(skb); 511 512 spin_lock_irqsave(&list->lock, flags); 513 sock_skb_set_dropcount(sk, skb); 514 __skb_queue_tail(list, skb); 515 spin_unlock_irqrestore(&list->lock, flags); 516 517 if (!sock_flag(sk, SOCK_DEAD)) 518 sk->sk_data_ready(sk); 519 return 0; 520 } 521 EXPORT_SYMBOL(__sock_queue_rcv_skb); 522 523 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb, 524 enum skb_drop_reason *reason) 525 { 526 enum skb_drop_reason drop_reason; 527 int err; 528 529 err = sk_filter_reason(sk, skb, &drop_reason); 530 if (err) 531 goto out; 532 533 err = __sock_queue_rcv_skb(sk, skb); 534 switch (err) { 535 case -ENOMEM: 536 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 537 break; 538 case -ENOBUFS: 539 drop_reason = SKB_DROP_REASON_PROTO_MEM; 540 break; 541 default: 542 drop_reason = SKB_NOT_DROPPED_YET; 543 
break; 544 } 545 out: 546 if (reason) 547 *reason = drop_reason; 548 return err; 549 } 550 EXPORT_SYMBOL(sock_queue_rcv_skb_reason); 551 552 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, 553 const int nested, unsigned int trim_cap, bool refcounted) 554 { 555 enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED; 556 int rc = NET_RX_SUCCESS; 557 int err; 558 559 if (sk_filter_trim_cap(sk, skb, trim_cap, &reason)) 560 goto discard_and_relse; 561 562 skb->dev = NULL; 563 564 if (sk_rcvqueues_full(sk, READ_ONCE(sk->sk_rcvbuf))) { 565 sk_drops_inc(sk); 566 reason = SKB_DROP_REASON_SOCKET_RCVBUFF; 567 goto discard_and_relse; 568 } 569 if (nested) 570 bh_lock_sock_nested(sk); 571 else 572 bh_lock_sock(sk); 573 if (!sock_owned_by_user(sk)) { 574 /* 575 * trylock + unlock semantics: 576 */ 577 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_); 578 579 rc = sk_backlog_rcv(sk, skb); 580 581 mutex_release(&sk->sk_lock.dep_map, _RET_IP_); 582 } else if ((err = sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf)))) { 583 bh_unlock_sock(sk); 584 if (err == -ENOMEM) 585 reason = SKB_DROP_REASON_PFMEMALLOC; 586 if (err == -ENOBUFS) 587 reason = SKB_DROP_REASON_SOCKET_BACKLOG; 588 sk_drops_inc(sk); 589 goto discard_and_relse; 590 } 591 592 bh_unlock_sock(sk); 593 out: 594 if (refcounted) 595 sock_put(sk); 596 return rc; 597 discard_and_relse: 598 sk_skb_reason_drop(sk, skb, reason); 599 goto out; 600 } 601 EXPORT_SYMBOL(__sk_receive_skb); 602 603 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *, 604 u32)); 605 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *, 606 u32)); 607 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie) 608 { 609 struct dst_entry *dst = __sk_dst_get(sk); 610 611 if (dst && READ_ONCE(dst->obsolete) && 612 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 613 dst, cookie) == NULL) { 614 sk_tx_queue_clear(sk); 615 WRITE_ONCE(sk->sk_dst_pending_confirm, 0); 616 RCU_INIT_POINTER(sk->sk_dst_cache, NULL); 617 dst_release(dst); 618 return NULL; 619 } 620 621 return dst; 622 } 623 EXPORT_SYMBOL(__sk_dst_check); 624 625 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie) 626 { 627 struct dst_entry *dst = sk_dst_get(sk); 628 629 if (dst && READ_ONCE(dst->obsolete) && 630 INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check, 631 dst, cookie) == NULL) { 632 sk_dst_reset(sk); 633 dst_release(dst); 634 return NULL; 635 } 636 637 return dst; 638 } 639 EXPORT_SYMBOL(sk_dst_check); 640 641 static int sock_bindtoindex_locked(struct sock *sk, int ifindex) 642 { 643 int ret = -ENOPROTOOPT; 644 #ifdef CONFIG_NETDEVICES 645 struct net *net = sock_net(sk); 646 647 /* Sorry... */ 648 ret = -EPERM; 649 if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW)) 650 goto out; 651 652 ret = -EINVAL; 653 if (ifindex < 0) 654 goto out; 655 656 /* Paired with all READ_ONCE() done locklessly. 
*/ 657 WRITE_ONCE(sk->sk_bound_dev_if, ifindex); 658 659 if (sk->sk_prot->rehash) 660 sk->sk_prot->rehash(sk); 661 sk_dst_reset(sk); 662 663 ret = 0; 664 665 out: 666 #endif 667 668 return ret; 669 } 670 671 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk) 672 { 673 int ret; 674 675 if (lock_sk) 676 lock_sock(sk); 677 ret = sock_bindtoindex_locked(sk, ifindex); 678 if (lock_sk) 679 release_sock(sk); 680 681 return ret; 682 } 683 EXPORT_SYMBOL(sock_bindtoindex); 684 685 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) 686 { 687 int ret = -ENOPROTOOPT; 688 #ifdef CONFIG_NETDEVICES 689 struct net *net = sock_net(sk); 690 char devname[IFNAMSIZ]; 691 int index; 692 693 ret = -EINVAL; 694 if (optlen < 0) 695 goto out; 696 697 /* Bind this socket to a particular device like "eth0", 698 * as specified in the passed interface name. If the 699 * name is "" or the option length is zero the socket 700 * is not bound. 701 */ 702 if (optlen > IFNAMSIZ - 1) 703 optlen = IFNAMSIZ - 1; 704 memset(devname, 0, sizeof(devname)); 705 706 ret = -EFAULT; 707 if (copy_from_sockptr(devname, optval, optlen)) 708 goto out; 709 710 index = 0; 711 if (devname[0] != '\0') { 712 struct net_device *dev; 713 714 rcu_read_lock(); 715 dev = dev_get_by_name_rcu(net, devname); 716 if (dev) 717 index = dev->ifindex; 718 rcu_read_unlock(); 719 ret = -ENODEV; 720 if (!dev) 721 goto out; 722 } 723 724 sockopt_lock_sock(sk); 725 ret = sock_bindtoindex_locked(sk, index); 726 sockopt_release_sock(sk); 727 out: 728 #endif 729 730 return ret; 731 } 732 733 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, 734 sockptr_t optlen, int len) 735 { 736 int ret = -ENOPROTOOPT; 737 #ifdef CONFIG_NETDEVICES 738 int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if); 739 struct net *net = sock_net(sk); 740 char devname[IFNAMSIZ]; 741 742 if (bound_dev_if == 0) { 743 len = 0; 744 goto zero; 745 } 746 747 ret = -EINVAL; 748 if (len < IFNAMSIZ) 749 goto out; 750 751 ret = netdev_get_name(net, devname, bound_dev_if); 752 if (ret) 753 goto out; 754 755 len = strlen(devname) + 1; 756 757 ret = -EFAULT; 758 if (copy_to_sockptr(optval, devname, len)) 759 goto out; 760 761 zero: 762 ret = -EFAULT; 763 if (copy_to_sockptr(optlen, &len, sizeof(int))) 764 goto out; 765 766 ret = 0; 767 768 out: 769 #endif 770 771 return ret; 772 } 773 774 bool sk_mc_loop(const struct sock *sk) 775 { 776 if (dev_recursion_level()) 777 return false; 778 if (!sk) 779 return true; 780 /* IPV6_ADDRFORM can change sk->sk_family under us. 
*/ 781 switch (READ_ONCE(sk->sk_family)) { 782 case AF_INET: 783 return inet_test_bit(MC_LOOP, sk); 784 #if IS_ENABLED(CONFIG_IPV6) 785 case AF_INET6: 786 return inet6_test_bit(MC6_LOOP, sk); 787 #endif 788 } 789 WARN_ON_ONCE(1); 790 return true; 791 } 792 EXPORT_SYMBOL(sk_mc_loop); 793 794 void sock_set_reuseaddr(struct sock *sk) 795 { 796 lock_sock(sk); 797 sk->sk_reuse = SK_CAN_REUSE; 798 release_sock(sk); 799 } 800 EXPORT_SYMBOL(sock_set_reuseaddr); 801 802 void sock_set_reuseport(struct sock *sk) 803 { 804 lock_sock(sk); 805 sk->sk_reuseport = true; 806 release_sock(sk); 807 } 808 EXPORT_SYMBOL(sock_set_reuseport); 809 810 void sock_no_linger(struct sock *sk) 811 { 812 lock_sock(sk); 813 WRITE_ONCE(sk->sk_lingertime, 0); 814 sock_set_flag(sk, SOCK_LINGER); 815 release_sock(sk); 816 } 817 EXPORT_SYMBOL(sock_no_linger); 818 819 void sock_set_priority(struct sock *sk, u32 priority) 820 { 821 WRITE_ONCE(sk->sk_priority, priority); 822 } 823 EXPORT_SYMBOL(sock_set_priority); 824 825 void sock_set_sndtimeo(struct sock *sk, s64 secs) 826 { 827 if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1) 828 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ); 829 else 830 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT); 831 } 832 EXPORT_SYMBOL(sock_set_sndtimeo); 833 834 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns) 835 { 836 sock_valbool_flag(sk, SOCK_RCVTSTAMP, val); 837 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, val && ns); 838 if (val) { 839 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new); 840 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 841 } 842 } 843 844 void sock_set_timestamp(struct sock *sk, int optname, bool valbool) 845 { 846 switch (optname) { 847 case SO_TIMESTAMP_OLD: 848 __sock_set_timestamps(sk, valbool, false, false); 849 break; 850 case SO_TIMESTAMP_NEW: 851 __sock_set_timestamps(sk, valbool, true, false); 852 break; 853 case SO_TIMESTAMPNS_OLD: 854 __sock_set_timestamps(sk, valbool, false, true); 855 break; 856 case SO_TIMESTAMPNS_NEW: 857 __sock_set_timestamps(sk, valbool, true, true); 858 break; 859 } 860 } 861 862 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index) 863 { 864 struct net *net = sock_net(sk); 865 struct net_device *dev = NULL; 866 bool match = false; 867 int *vclock_index; 868 int i, num; 869 870 if (sk->sk_bound_dev_if) 871 dev = dev_get_by_index(net, sk->sk_bound_dev_if); 872 873 if (!dev) { 874 pr_err("%s: sock not bind to device\n", __func__); 875 return -EOPNOTSUPP; 876 } 877 878 num = ethtool_get_phc_vclocks(dev, &vclock_index); 879 dev_put(dev); 880 881 for (i = 0; i < num; i++) { 882 if (*(vclock_index + i) == phc_index) { 883 match = true; 884 break; 885 } 886 } 887 888 if (num > 0) 889 kfree(vclock_index); 890 891 if (!match) 892 return -EINVAL; 893 894 WRITE_ONCE(sk->sk_bind_phc, phc_index); 895 896 return 0; 897 } 898 899 int sock_set_timestamping(struct sock *sk, int optname, 900 struct so_timestamping timestamping) 901 { 902 int val = timestamping.flags; 903 int ret; 904 905 if (val & ~SOF_TIMESTAMPING_MASK) 906 return -EINVAL; 907 908 if (val & SOF_TIMESTAMPING_OPT_ID_TCP && 909 !(val & SOF_TIMESTAMPING_OPT_ID)) 910 return -EINVAL; 911 912 if (val & SOF_TIMESTAMPING_OPT_ID && 913 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 914 if (sk_is_tcp(sk)) { 915 if ((1 << sk->sk_state) & 916 (TCPF_CLOSE | TCPF_LISTEN)) 917 return -EINVAL; 918 if (val & SOF_TIMESTAMPING_OPT_ID_TCP) 919 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq); 920 else 921 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una); 922 } else { 923 
atomic_set(&sk->sk_tskey, 0); 924 } 925 } 926 927 if (val & SOF_TIMESTAMPING_OPT_STATS && 928 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) 929 return -EINVAL; 930 931 if (val & SOF_TIMESTAMPING_BIND_PHC) { 932 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc); 933 if (ret) 934 return ret; 935 } 936 937 WRITE_ONCE(sk->sk_tsflags, val); 938 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW); 939 sock_valbool_flag(sk, SOCK_TIMESTAMPING_ANY, !!(val & TSFLAGS_ANY)); 940 941 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 942 sock_enable_timestamp(sk, 943 SOCK_TIMESTAMPING_RX_SOFTWARE); 944 else 945 sock_disable_timestamp(sk, 946 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 947 return 0; 948 } 949 950 #if defined(CONFIG_CGROUP_BPF) 951 void bpf_skops_tx_timestamping(struct sock *sk, struct sk_buff *skb, int op) 952 { 953 struct bpf_sock_ops_kern sock_ops; 954 955 memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); 956 sock_ops.op = op; 957 sock_ops.is_fullsock = 1; 958 sock_ops.sk = sk; 959 bpf_skops_init_skb(&sock_ops, skb, 0); 960 __cgroup_bpf_run_filter_sock_ops(sk, &sock_ops, CGROUP_SOCK_OPS); 961 } 962 #endif 963 964 void sock_set_keepalive(struct sock *sk) 965 { 966 lock_sock(sk); 967 if (sk->sk_prot->keepalive) 968 sk->sk_prot->keepalive(sk, true); 969 sock_valbool_flag(sk, SOCK_KEEPOPEN, true); 970 release_sock(sk); 971 } 972 EXPORT_SYMBOL(sock_set_keepalive); 973 974 static void __sock_set_rcvbuf(struct sock *sk, int val) 975 { 976 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it 977 * as a negative value. 978 */ 979 val = min_t(int, val, INT_MAX / 2); 980 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 981 982 /* We double it on the way in to account for "struct sk_buff" etc. 983 * overhead. Applications assume that the SO_RCVBUF setting they make 984 * will allow that much actual data to be received on that socket. 985 * 986 * Applications are unaware that "struct sk_buff" and other overheads 987 * allocate from the receive buffer during socket buffer allocation. 988 * 989 * And after considering the possible alternatives, returning the value 990 * we actually used in getsockopt is the most desirable behavior. 
991 */ 992 WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF)); 993 } 994 995 void sock_set_rcvbuf(struct sock *sk, int val) 996 { 997 lock_sock(sk); 998 __sock_set_rcvbuf(sk, val); 999 release_sock(sk); 1000 } 1001 EXPORT_SYMBOL(sock_set_rcvbuf); 1002 1003 static void __sock_set_mark(struct sock *sk, u32 val) 1004 { 1005 if (val != sk->sk_mark) { 1006 WRITE_ONCE(sk->sk_mark, val); 1007 sk_dst_reset(sk); 1008 } 1009 } 1010 1011 void sock_set_mark(struct sock *sk, u32 val) 1012 { 1013 lock_sock(sk); 1014 __sock_set_mark(sk, val); 1015 release_sock(sk); 1016 } 1017 EXPORT_SYMBOL(sock_set_mark); 1018 1019 static void sock_release_reserved_memory(struct sock *sk, int bytes) 1020 { 1021 /* Round down bytes to multiple of pages */ 1022 bytes = round_down(bytes, PAGE_SIZE); 1023 1024 WARN_ON(bytes > sk->sk_reserved_mem); 1025 WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes); 1026 sk_mem_reclaim(sk); 1027 } 1028 1029 static int sock_reserve_memory(struct sock *sk, int bytes) 1030 { 1031 long allocated; 1032 bool charged; 1033 int pages; 1034 1035 if (!mem_cgroup_sk_enabled(sk) || !sk_has_account(sk)) 1036 return -EOPNOTSUPP; 1037 1038 if (!bytes) 1039 return 0; 1040 1041 pages = sk_mem_pages(bytes); 1042 1043 /* pre-charge to memcg */ 1044 charged = mem_cgroup_sk_charge(sk, pages, 1045 GFP_KERNEL | __GFP_RETRY_MAYFAIL); 1046 if (!charged) 1047 return -ENOMEM; 1048 1049 if (sk->sk_bypass_prot_mem) 1050 goto success; 1051 1052 /* pre-charge to forward_alloc */ 1053 sk_memory_allocated_add(sk, pages); 1054 allocated = sk_memory_allocated(sk); 1055 1056 /* If the system goes into memory pressure with this 1057 * precharge, give up and return error. 1058 */ 1059 if (allocated > sk_prot_mem_limits(sk, 1)) { 1060 sk_memory_allocated_sub(sk, pages); 1061 mem_cgroup_sk_uncharge(sk, pages); 1062 return -ENOMEM; 1063 } 1064 1065 success: 1066 sk_forward_alloc_add(sk, pages << PAGE_SHIFT); 1067 1068 WRITE_ONCE(sk->sk_reserved_mem, 1069 sk->sk_reserved_mem + (pages << PAGE_SHIFT)); 1070 1071 return 0; 1072 } 1073 1074 #ifdef CONFIG_PAGE_POOL 1075 1076 /* This is the number of tokens and frags that the user can SO_DEVMEM_DONTNEED 1077 * in 1 syscall. The limit exists to limit the amount of memory the kernel 1078 * allocates to copy these tokens, and to prevent looping over the frags for 1079 * too long. 
1080 */ 1081 #define MAX_DONTNEED_TOKENS 128 1082 #define MAX_DONTNEED_FRAGS 1024 1083 1084 static noinline_for_stack int 1085 sock_devmem_dontneed(struct sock *sk, sockptr_t optval, unsigned int optlen) 1086 { 1087 unsigned int num_tokens, i, j, k, netmem_num = 0; 1088 struct dmabuf_token *tokens; 1089 int ret = 0, num_frags = 0; 1090 netmem_ref netmems[16]; 1091 1092 if (!sk_is_tcp(sk)) 1093 return -EBADF; 1094 1095 if (optlen % sizeof(*tokens) || 1096 optlen > sizeof(*tokens) * MAX_DONTNEED_TOKENS) 1097 return -EINVAL; 1098 1099 num_tokens = optlen / sizeof(*tokens); 1100 tokens = kvmalloc_array(num_tokens, sizeof(*tokens), GFP_KERNEL); 1101 if (!tokens) 1102 return -ENOMEM; 1103 1104 if (copy_from_sockptr(tokens, optval, optlen)) { 1105 kvfree(tokens); 1106 return -EFAULT; 1107 } 1108 1109 xa_lock_bh(&sk->sk_user_frags); 1110 for (i = 0; i < num_tokens; i++) { 1111 for (j = 0; j < tokens[i].token_count; j++) { 1112 if (++num_frags > MAX_DONTNEED_FRAGS) 1113 goto frag_limit_reached; 1114 1115 netmem_ref netmem = (__force netmem_ref)__xa_erase( 1116 &sk->sk_user_frags, tokens[i].token_start + j); 1117 1118 if (!netmem || WARN_ON_ONCE(!netmem_is_net_iov(netmem))) 1119 continue; 1120 1121 netmems[netmem_num++] = netmem; 1122 if (netmem_num == ARRAY_SIZE(netmems)) { 1123 xa_unlock_bh(&sk->sk_user_frags); 1124 for (k = 0; k < netmem_num; k++) 1125 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1126 netmem_num = 0; 1127 xa_lock_bh(&sk->sk_user_frags); 1128 } 1129 ret++; 1130 } 1131 } 1132 1133 frag_limit_reached: 1134 xa_unlock_bh(&sk->sk_user_frags); 1135 for (k = 0; k < netmem_num; k++) 1136 WARN_ON_ONCE(!napi_pp_put_page(netmems[k])); 1137 1138 kvfree(tokens); 1139 return ret; 1140 } 1141 #endif 1142 1143 void sockopt_lock_sock(struct sock *sk) 1144 { 1145 /* When current->bpf_ctx is set, the setsockopt is called from 1146 * a bpf prog. bpf has ensured the sk lock has been 1147 * acquired before calling setsockopt(). 1148 */ 1149 if (has_current_bpf_ctx()) 1150 return; 1151 1152 lock_sock(sk); 1153 } 1154 EXPORT_SYMBOL(sockopt_lock_sock); 1155 1156 void sockopt_release_sock(struct sock *sk) 1157 { 1158 if (has_current_bpf_ctx()) 1159 return; 1160 1161 release_sock(sk); 1162 } 1163 EXPORT_SYMBOL(sockopt_release_sock); 1164 1165 bool sockopt_ns_capable(struct user_namespace *ns, int cap) 1166 { 1167 return has_current_bpf_ctx() || ns_capable(ns, cap); 1168 } 1169 EXPORT_SYMBOL(sockopt_ns_capable); 1170 1171 bool sockopt_capable(int cap) 1172 { 1173 return has_current_bpf_ctx() || capable(cap); 1174 } 1175 EXPORT_SYMBOL(sockopt_capable); 1176 1177 static int sockopt_validate_clockid(__kernel_clockid_t value) 1178 { 1179 switch (value) { 1180 case CLOCK_REALTIME: 1181 case CLOCK_MONOTONIC: 1182 case CLOCK_TAI: 1183 return 0; 1184 } 1185 return -EINVAL; 1186 } 1187 1188 /* 1189 * This is meant for all protocols to use and covers goings on 1190 * at the socket level. Everything here is generic. 
1191 */ 1192 1193 int sk_setsockopt(struct sock *sk, int level, int optname, 1194 sockptr_t optval, unsigned int optlen) 1195 { 1196 struct so_timestamping timestamping; 1197 struct socket *sock = sk->sk_socket; 1198 struct sock_txtime sk_txtime; 1199 int val; 1200 int valbool; 1201 struct linger ling; 1202 int ret = 0; 1203 1204 /* 1205 * Options without arguments 1206 */ 1207 1208 if (optname == SO_BINDTODEVICE) 1209 return sock_setbindtodevice(sk, optval, optlen); 1210 1211 if (optlen < sizeof(int)) 1212 return -EINVAL; 1213 1214 if (copy_from_sockptr(&val, optval, sizeof(val))) 1215 return -EFAULT; 1216 1217 valbool = val ? 1 : 0; 1218 1219 /* handle options which do not require locking the socket. */ 1220 switch (optname) { 1221 case SO_PRIORITY: 1222 if (sk_set_prio_allowed(sk, val)) { 1223 sock_set_priority(sk, val); 1224 return 0; 1225 } 1226 return -EPERM; 1227 case SO_TYPE: 1228 case SO_PROTOCOL: 1229 case SO_DOMAIN: 1230 case SO_ERROR: 1231 return -ENOPROTOOPT; 1232 #ifdef CONFIG_NET_RX_BUSY_POLL 1233 case SO_BUSY_POLL: 1234 if (val < 0) 1235 return -EINVAL; 1236 WRITE_ONCE(sk->sk_ll_usec, val); 1237 return 0; 1238 case SO_PREFER_BUSY_POLL: 1239 if (valbool && !sockopt_capable(CAP_NET_ADMIN)) 1240 return -EPERM; 1241 WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); 1242 return 0; 1243 case SO_BUSY_POLL_BUDGET: 1244 if (val > READ_ONCE(sk->sk_busy_poll_budget) && 1245 !sockopt_capable(CAP_NET_ADMIN)) 1246 return -EPERM; 1247 if (val < 0 || val > U16_MAX) 1248 return -EINVAL; 1249 WRITE_ONCE(sk->sk_busy_poll_budget, val); 1250 return 0; 1251 #endif 1252 case SO_MAX_PACING_RATE: 1253 { 1254 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val; 1255 unsigned long pacing_rate; 1256 1257 if (sizeof(ulval) != sizeof(val) && 1258 optlen >= sizeof(ulval) && 1259 copy_from_sockptr(&ulval, optval, sizeof(ulval))) { 1260 return -EFAULT; 1261 } 1262 if (ulval != ~0UL) 1263 cmpxchg(&sk->sk_pacing_status, 1264 SK_PACING_NONE, 1265 SK_PACING_NEEDED); 1266 /* Pairs with READ_ONCE() from sk_getsockopt() */ 1267 WRITE_ONCE(sk->sk_max_pacing_rate, ulval); 1268 pacing_rate = READ_ONCE(sk->sk_pacing_rate); 1269 if (ulval < pacing_rate) 1270 WRITE_ONCE(sk->sk_pacing_rate, ulval); 1271 return 0; 1272 } 1273 case SO_TXREHASH: 1274 if (!sk_is_tcp(sk)) 1275 return -EOPNOTSUPP; 1276 if (val < -1 || val > 1) 1277 return -EINVAL; 1278 if ((u8)val == SOCK_TXREHASH_DEFAULT) 1279 val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash); 1280 /* Paired with READ_ONCE() in tcp_rtx_synack() 1281 * and sk_getsockopt(). 
1282 */ 1283 WRITE_ONCE(sk->sk_txrehash, (u8)val); 1284 return 0; 1285 case SO_PEEK_OFF: 1286 { 1287 int (*set_peek_off)(struct sock *sk, int val); 1288 1289 set_peek_off = READ_ONCE(sock->ops)->set_peek_off; 1290 if (set_peek_off) 1291 ret = set_peek_off(sk, val); 1292 else 1293 ret = -EOPNOTSUPP; 1294 return ret; 1295 } 1296 #ifdef CONFIG_PAGE_POOL 1297 case SO_DEVMEM_DONTNEED: 1298 return sock_devmem_dontneed(sk, optval, optlen); 1299 #endif 1300 case SO_SNDTIMEO_OLD: 1301 case SO_SNDTIMEO_NEW: 1302 return sock_set_timeout(&sk->sk_sndtimeo, optval, 1303 optlen, optname == SO_SNDTIMEO_OLD); 1304 case SO_RCVTIMEO_OLD: 1305 case SO_RCVTIMEO_NEW: 1306 return sock_set_timeout(&sk->sk_rcvtimeo, optval, 1307 optlen, optname == SO_RCVTIMEO_OLD); 1308 } 1309 1310 sockopt_lock_sock(sk); 1311 1312 switch (optname) { 1313 case SO_DEBUG: 1314 if (val && !sockopt_capable(CAP_NET_ADMIN)) 1315 ret = -EACCES; 1316 else 1317 sock_valbool_flag(sk, SOCK_DBG, valbool); 1318 break; 1319 case SO_REUSEADDR: 1320 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 1321 break; 1322 case SO_REUSEPORT: 1323 if (valbool && !sk_is_inet(sk)) 1324 ret = -EOPNOTSUPP; 1325 else 1326 sk->sk_reuseport = valbool; 1327 break; 1328 case SO_DONTROUTE: 1329 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 1330 sk_dst_reset(sk); 1331 break; 1332 case SO_BROADCAST: 1333 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 1334 break; 1335 case SO_SNDBUF: 1336 /* Don't error on this BSD doesn't and if you think 1337 * about it this is right. Otherwise apps have to 1338 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1339 * are treated in BSD as hints 1340 */ 1341 val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); 1342 set_sndbuf: 1343 /* Ensure val * 2 fits into an int, to prevent max_t() 1344 * from treating it as a negative value. 1345 */ 1346 val = min_t(int, val, INT_MAX / 2); 1347 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 1348 WRITE_ONCE(sk->sk_sndbuf, 1349 max_t(int, val * 2, SOCK_MIN_SNDBUF)); 1350 /* Wake up sending tasks if we upped the value. */ 1351 sk->sk_write_space(sk); 1352 break; 1353 1354 case SO_SNDBUFFORCE: 1355 if (!sockopt_capable(CAP_NET_ADMIN)) { 1356 ret = -EPERM; 1357 break; 1358 } 1359 1360 /* No negative values (to prevent underflow, as val will be 1361 * multiplied by 2). 1362 */ 1363 if (val < 0) 1364 val = 0; 1365 goto set_sndbuf; 1366 1367 case SO_RCVBUF: 1368 /* Don't error on this BSD doesn't and if you think 1369 * about it this is right. Otherwise apps have to 1370 * play 'guess the biggest size' games. RCVBUF/SNDBUF 1371 * are treated in BSD as hints 1372 */ 1373 __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max))); 1374 break; 1375 1376 case SO_RCVBUFFORCE: 1377 if (!sockopt_capable(CAP_NET_ADMIN)) { 1378 ret = -EPERM; 1379 break; 1380 } 1381 1382 /* No negative values (to prevent underflow, as val will be 1383 * multiplied by 2). 
1384 */ 1385 __sock_set_rcvbuf(sk, max(val, 0)); 1386 break; 1387 1388 case SO_KEEPALIVE: 1389 if (sk->sk_prot->keepalive) 1390 sk->sk_prot->keepalive(sk, valbool); 1391 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 1392 break; 1393 1394 case SO_OOBINLINE: 1395 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 1396 break; 1397 1398 case SO_NO_CHECK: 1399 sk->sk_no_check_tx = valbool; 1400 break; 1401 1402 case SO_LINGER: 1403 if (optlen < sizeof(ling)) { 1404 ret = -EINVAL; /* 1003.1g */ 1405 break; 1406 } 1407 if (copy_from_sockptr(&ling, optval, sizeof(ling))) { 1408 ret = -EFAULT; 1409 break; 1410 } 1411 if (!ling.l_onoff) { 1412 sock_reset_flag(sk, SOCK_LINGER); 1413 } else { 1414 unsigned long t_sec = ling.l_linger; 1415 1416 if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ) 1417 WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT); 1418 else 1419 WRITE_ONCE(sk->sk_lingertime, t_sec * HZ); 1420 sock_set_flag(sk, SOCK_LINGER); 1421 } 1422 break; 1423 1424 case SO_BSDCOMPAT: 1425 break; 1426 1427 case SO_TIMESTAMP_OLD: 1428 case SO_TIMESTAMP_NEW: 1429 case SO_TIMESTAMPNS_OLD: 1430 case SO_TIMESTAMPNS_NEW: 1431 sock_set_timestamp(sk, optname, valbool); 1432 break; 1433 1434 case SO_TIMESTAMPING_NEW: 1435 case SO_TIMESTAMPING_OLD: 1436 if (optlen == sizeof(timestamping)) { 1437 if (copy_from_sockptr(×tamping, optval, 1438 sizeof(timestamping))) { 1439 ret = -EFAULT; 1440 break; 1441 } 1442 } else { 1443 memset(×tamping, 0, sizeof(timestamping)); 1444 timestamping.flags = val; 1445 } 1446 ret = sock_set_timestamping(sk, optname, timestamping); 1447 break; 1448 1449 case SO_RCVLOWAT: 1450 { 1451 int (*set_rcvlowat)(struct sock *sk, int val) = NULL; 1452 1453 if (val < 0) 1454 val = INT_MAX; 1455 if (sock) 1456 set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat; 1457 if (set_rcvlowat) 1458 ret = set_rcvlowat(sk, val); 1459 else 1460 WRITE_ONCE(sk->sk_rcvlowat, val ? 
: 1); 1461 break; 1462 } 1463 case SO_ATTACH_FILTER: { 1464 struct sock_fprog fprog; 1465 1466 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1467 if (!ret) 1468 ret = sk_attach_filter(&fprog, sk); 1469 break; 1470 } 1471 case SO_ATTACH_BPF: 1472 ret = -EINVAL; 1473 if (optlen == sizeof(u32)) { 1474 u32 ufd; 1475 1476 ret = -EFAULT; 1477 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1478 break; 1479 1480 ret = sk_attach_bpf(ufd, sk); 1481 } 1482 break; 1483 1484 case SO_ATTACH_REUSEPORT_CBPF: { 1485 struct sock_fprog fprog; 1486 1487 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen); 1488 if (!ret) 1489 ret = sk_reuseport_attach_filter(&fprog, sk); 1490 break; 1491 } 1492 case SO_ATTACH_REUSEPORT_EBPF: 1493 ret = -EINVAL; 1494 if (optlen == sizeof(u32)) { 1495 u32 ufd; 1496 1497 ret = -EFAULT; 1498 if (copy_from_sockptr(&ufd, optval, sizeof(ufd))) 1499 break; 1500 1501 ret = sk_reuseport_attach_bpf(ufd, sk); 1502 } 1503 break; 1504 1505 case SO_DETACH_REUSEPORT_BPF: 1506 ret = reuseport_detach_prog(sk); 1507 break; 1508 1509 case SO_DETACH_FILTER: 1510 ret = sk_detach_filter(sk); 1511 break; 1512 1513 case SO_LOCK_FILTER: 1514 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 1515 ret = -EPERM; 1516 else 1517 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 1518 break; 1519 1520 case SO_MARK: 1521 if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 1522 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1523 ret = -EPERM; 1524 break; 1525 } 1526 1527 __sock_set_mark(sk, val); 1528 break; 1529 case SO_RCVMARK: 1530 sock_valbool_flag(sk, SOCK_RCVMARK, valbool); 1531 break; 1532 1533 case SO_RCVPRIORITY: 1534 sock_valbool_flag(sk, SOCK_RCVPRIORITY, valbool); 1535 break; 1536 1537 case SO_RXQ_OVFL: 1538 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 1539 break; 1540 1541 case SO_WIFI_STATUS: 1542 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 1543 break; 1544 1545 case SO_NOFCS: 1546 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 1547 break; 1548 1549 case SO_SELECT_ERR_QUEUE: 1550 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1551 break; 1552 1553 case SO_PASSCRED: 1554 if (sk_may_scm_recv(sk)) 1555 sk->sk_scm_credentials = valbool; 1556 else 1557 ret = -EOPNOTSUPP; 1558 break; 1559 1560 case SO_PASSSEC: 1561 if (IS_ENABLED(CONFIG_SECURITY_NETWORK) && sk_may_scm_recv(sk)) 1562 sk->sk_scm_security = valbool; 1563 else 1564 ret = -EOPNOTSUPP; 1565 break; 1566 1567 case SO_PASSPIDFD: 1568 if (sk_is_unix(sk)) 1569 sk->sk_scm_pidfd = valbool; 1570 else 1571 ret = -EOPNOTSUPP; 1572 break; 1573 1574 case SO_PASSRIGHTS: 1575 if (sk_is_unix(sk)) 1576 sk->sk_scm_rights = valbool; 1577 else 1578 ret = -EOPNOTSUPP; 1579 break; 1580 1581 case SO_INCOMING_CPU: 1582 reuseport_update_incoming_cpu(sk, val); 1583 break; 1584 1585 case SO_CNX_ADVICE: 1586 if (val == 1) 1587 dst_negative_advice(sk); 1588 break; 1589 1590 case SO_ZEROCOPY: 1591 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1592 if (!(sk_is_tcp(sk) || 1593 (sk->sk_type == SOCK_DGRAM && 1594 sk->sk_protocol == IPPROTO_UDP))) 1595 ret = -EOPNOTSUPP; 1596 } else if (sk->sk_family != PF_RDS) { 1597 ret = -EOPNOTSUPP; 1598 } 1599 if (!ret) { 1600 if (val < 0 || val > 1) 1601 ret = -EINVAL; 1602 else 1603 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1604 } 1605 break; 1606 1607 case SO_TXTIME: 1608 if (optlen != sizeof(struct sock_txtime)) { 1609 ret = -EINVAL; 1610 break; 1611 } else if (copy_from_sockptr(&sk_txtime, optval, 1612 sizeof(struct sock_txtime))) { 1613 ret = -EFAULT; 1614 
break; 1615 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1616 ret = -EINVAL; 1617 break; 1618 } 1619 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet 1620 * scheduler has enough safe guards. 1621 */ 1622 if (sk_txtime.clockid != CLOCK_MONOTONIC && 1623 !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1624 ret = -EPERM; 1625 break; 1626 } 1627 1628 ret = sockopt_validate_clockid(sk_txtime.clockid); 1629 if (ret) 1630 break; 1631 1632 sock_valbool_flag(sk, SOCK_TXTIME, true); 1633 sk->sk_clockid = sk_txtime.clockid; 1634 sk->sk_txtime_deadline_mode = 1635 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1636 sk->sk_txtime_report_errors = 1637 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1638 break; 1639 1640 case SO_BINDTOIFINDEX: 1641 ret = sock_bindtoindex_locked(sk, val); 1642 break; 1643 1644 case SO_BUF_LOCK: 1645 if (val & ~SOCK_BUF_LOCK_MASK) { 1646 ret = -EINVAL; 1647 break; 1648 } 1649 sk->sk_userlocks = val | (sk->sk_userlocks & 1650 ~SOCK_BUF_LOCK_MASK); 1651 break; 1652 1653 case SO_RESERVE_MEM: 1654 { 1655 int delta; 1656 1657 if (val < 0) { 1658 ret = -EINVAL; 1659 break; 1660 } 1661 1662 delta = val - sk->sk_reserved_mem; 1663 if (delta < 0) 1664 sock_release_reserved_memory(sk, -delta); 1665 else 1666 ret = sock_reserve_memory(sk, delta); 1667 break; 1668 } 1669 1670 default: 1671 ret = -ENOPROTOOPT; 1672 break; 1673 } 1674 sockopt_release_sock(sk); 1675 return ret; 1676 } 1677 1678 int sock_setsockopt(struct socket *sock, int level, int optname, 1679 sockptr_t optval, unsigned int optlen) 1680 { 1681 return sk_setsockopt(sock->sk, level, optname, 1682 optval, optlen); 1683 } 1684 EXPORT_SYMBOL(sock_setsockopt); 1685 1686 static const struct cred *sk_get_peer_cred(struct sock *sk) 1687 { 1688 const struct cred *cred; 1689 1690 spin_lock(&sk->sk_peer_lock); 1691 cred = get_cred(sk->sk_peer_cred); 1692 spin_unlock(&sk->sk_peer_lock); 1693 1694 return cred; 1695 } 1696 1697 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1698 struct ucred *ucred) 1699 { 1700 ucred->pid = pid_vnr(pid); 1701 ucred->uid = ucred->gid = -1; 1702 if (cred) { 1703 struct user_namespace *current_ns = current_user_ns(); 1704 1705 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1706 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1707 } 1708 } 1709 1710 static int groups_to_user(sockptr_t dst, const struct group_info *src) 1711 { 1712 struct user_namespace *user_ns = current_user_ns(); 1713 int i; 1714 1715 for (i = 0; i < src->ngroups; i++) { 1716 gid_t gid = from_kgid_munged(user_ns, src->gid[i]); 1717 1718 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) 1719 return -EFAULT; 1720 } 1721 1722 return 0; 1723 } 1724 1725 int sk_getsockopt(struct sock *sk, int level, int optname, 1726 sockptr_t optval, sockptr_t optlen) 1727 { 1728 struct socket *sock = sk->sk_socket; 1729 1730 union { 1731 int val; 1732 u64 val64; 1733 unsigned long ulval; 1734 struct linger ling; 1735 struct old_timeval32 tm32; 1736 struct __kernel_old_timeval tm; 1737 struct __kernel_sock_timeval stm; 1738 struct sock_txtime txtime; 1739 struct so_timestamping timestamping; 1740 } v; 1741 1742 int lv = sizeof(int); 1743 int len; 1744 1745 if (copy_from_sockptr(&len, optlen, sizeof(int))) 1746 return -EFAULT; 1747 if (len < 0) 1748 return -EINVAL; 1749 1750 memset(&v, 0, sizeof(v)); 1751 1752 switch (optname) { 1753 case SO_DEBUG: 1754 v.val = sock_flag(sk, SOCK_DBG); 1755 break; 1756 1757 case SO_DONTROUTE: 1758 v.val = sock_flag(sk, 
SOCK_LOCALROUTE); 1759 break; 1760 1761 case SO_BROADCAST: 1762 v.val = sock_flag(sk, SOCK_BROADCAST); 1763 break; 1764 1765 case SO_SNDBUF: 1766 v.val = READ_ONCE(sk->sk_sndbuf); 1767 break; 1768 1769 case SO_RCVBUF: 1770 v.val = READ_ONCE(sk->sk_rcvbuf); 1771 break; 1772 1773 case SO_REUSEADDR: 1774 v.val = sk->sk_reuse; 1775 break; 1776 1777 case SO_REUSEPORT: 1778 v.val = sk->sk_reuseport; 1779 break; 1780 1781 case SO_KEEPALIVE: 1782 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1783 break; 1784 1785 case SO_TYPE: 1786 v.val = sk->sk_type; 1787 break; 1788 1789 case SO_PROTOCOL: 1790 v.val = sk->sk_protocol; 1791 break; 1792 1793 case SO_DOMAIN: 1794 v.val = sk->sk_family; 1795 break; 1796 1797 case SO_ERROR: 1798 v.val = -sock_error(sk); 1799 if (v.val == 0) 1800 v.val = xchg(&sk->sk_err_soft, 0); 1801 break; 1802 1803 case SO_OOBINLINE: 1804 v.val = sock_flag(sk, SOCK_URGINLINE); 1805 break; 1806 1807 case SO_NO_CHECK: 1808 v.val = sk->sk_no_check_tx; 1809 break; 1810 1811 case SO_PRIORITY: 1812 v.val = READ_ONCE(sk->sk_priority); 1813 break; 1814 1815 case SO_LINGER: 1816 lv = sizeof(v.ling); 1817 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1818 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ; 1819 break; 1820 1821 case SO_BSDCOMPAT: 1822 break; 1823 1824 case SO_TIMESTAMP_OLD: 1825 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1826 !sock_flag(sk, SOCK_TSTAMP_NEW) && 1827 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1828 break; 1829 1830 case SO_TIMESTAMPNS_OLD: 1831 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW); 1832 break; 1833 1834 case SO_TIMESTAMP_NEW: 1835 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW); 1836 break; 1837 1838 case SO_TIMESTAMPNS_NEW: 1839 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW); 1840 break; 1841 1842 case SO_TIMESTAMPING_OLD: 1843 case SO_TIMESTAMPING_NEW: 1844 lv = sizeof(v.timestamping); 1845 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only 1846 * returning the flags when they were set through the same option. 1847 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD. 
1848 */ 1849 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) { 1850 v.timestamping.flags = READ_ONCE(sk->sk_tsflags); 1851 v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc); 1852 } 1853 break; 1854 1855 case SO_RCVTIMEO_OLD: 1856 case SO_RCVTIMEO_NEW: 1857 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v, 1858 SO_RCVTIMEO_OLD == optname); 1859 break; 1860 1861 case SO_SNDTIMEO_OLD: 1862 case SO_SNDTIMEO_NEW: 1863 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v, 1864 SO_SNDTIMEO_OLD == optname); 1865 break; 1866 1867 case SO_RCVLOWAT: 1868 v.val = READ_ONCE(sk->sk_rcvlowat); 1869 break; 1870 1871 case SO_SNDLOWAT: 1872 v.val = 1; 1873 break; 1874 1875 case SO_PASSCRED: 1876 if (!sk_may_scm_recv(sk)) 1877 return -EOPNOTSUPP; 1878 1879 v.val = sk->sk_scm_credentials; 1880 break; 1881 1882 case SO_PASSPIDFD: 1883 if (!sk_is_unix(sk)) 1884 return -EOPNOTSUPP; 1885 1886 v.val = sk->sk_scm_pidfd; 1887 break; 1888 1889 case SO_PASSRIGHTS: 1890 if (!sk_is_unix(sk)) 1891 return -EOPNOTSUPP; 1892 1893 v.val = sk->sk_scm_rights; 1894 break; 1895 1896 case SO_PEERCRED: 1897 { 1898 struct ucred peercred; 1899 if (len > sizeof(peercred)) 1900 len = sizeof(peercred); 1901 1902 spin_lock(&sk->sk_peer_lock); 1903 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1904 spin_unlock(&sk->sk_peer_lock); 1905 1906 if (copy_to_sockptr(optval, &peercred, len)) 1907 return -EFAULT; 1908 goto lenout; 1909 } 1910 1911 case SO_PEERPIDFD: 1912 { 1913 struct pid *peer_pid; 1914 struct file *pidfd_file = NULL; 1915 unsigned int flags = 0; 1916 int pidfd; 1917 1918 if (len > sizeof(pidfd)) 1919 len = sizeof(pidfd); 1920 1921 spin_lock(&sk->sk_peer_lock); 1922 peer_pid = get_pid(sk->sk_peer_pid); 1923 spin_unlock(&sk->sk_peer_lock); 1924 1925 if (!peer_pid) 1926 return -ENODATA; 1927 1928 /* The use of PIDFD_STALE requires stashing of struct pid 1929 * on pidfs with pidfs_register_pid() and only AF_UNIX 1930 * were prepared for this. 1931 */ 1932 if (sk->sk_family == AF_UNIX) 1933 flags = PIDFD_STALE; 1934 1935 pidfd = pidfd_prepare(peer_pid, flags, &pidfd_file); 1936 put_pid(peer_pid); 1937 if (pidfd < 0) 1938 return pidfd; 1939 1940 if (copy_to_sockptr(optval, &pidfd, len) || 1941 copy_to_sockptr(optlen, &len, sizeof(int))) { 1942 put_unused_fd(pidfd); 1943 fput(pidfd_file); 1944 1945 return -EFAULT; 1946 } 1947 1948 fd_install(pidfd, pidfd_file); 1949 return 0; 1950 } 1951 1952 case SO_PEERGROUPS: 1953 { 1954 const struct cred *cred; 1955 int ret, n; 1956 1957 cred = sk_get_peer_cred(sk); 1958 if (!cred) 1959 return -ENODATA; 1960 1961 n = cred->group_info->ngroups; 1962 if (len < n * sizeof(gid_t)) { 1963 len = n * sizeof(gid_t); 1964 put_cred(cred); 1965 return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; 1966 } 1967 len = n * sizeof(gid_t); 1968 1969 ret = groups_to_user(optval, cred->group_info); 1970 put_cred(cred); 1971 if (ret) 1972 return ret; 1973 goto lenout; 1974 } 1975 1976 case SO_PEERNAME: 1977 { 1978 struct sockaddr_storage address; 1979 1980 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2); 1981 if (lv < 0) 1982 return -ENOTCONN; 1983 if (lv < len) 1984 return -EINVAL; 1985 if (copy_to_sockptr(optval, &address, len)) 1986 return -EFAULT; 1987 goto lenout; 1988 } 1989 1990 /* Dubious BSD thing... Probably nobody even uses it, but 1991 * the UNIX standard wants it for whatever reason... 
-DaveM 1992 */ 1993 case SO_ACCEPTCONN: 1994 v.val = sk->sk_state == TCP_LISTEN; 1995 break; 1996 1997 case SO_PASSSEC: 1998 if (!IS_ENABLED(CONFIG_SECURITY_NETWORK) || !sk_may_scm_recv(sk)) 1999 return -EOPNOTSUPP; 2000 2001 v.val = sk->sk_scm_security; 2002 break; 2003 2004 case SO_PEERSEC: 2005 return security_socket_getpeersec_stream(sock, 2006 optval, optlen, len); 2007 2008 case SO_MARK: 2009 v.val = READ_ONCE(sk->sk_mark); 2010 break; 2011 2012 case SO_RCVMARK: 2013 v.val = sock_flag(sk, SOCK_RCVMARK); 2014 break; 2015 2016 case SO_RCVPRIORITY: 2017 v.val = sock_flag(sk, SOCK_RCVPRIORITY); 2018 break; 2019 2020 case SO_RXQ_OVFL: 2021 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 2022 break; 2023 2024 case SO_WIFI_STATUS: 2025 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 2026 break; 2027 2028 case SO_PEEK_OFF: 2029 if (!READ_ONCE(sock->ops)->set_peek_off) 2030 return -EOPNOTSUPP; 2031 2032 v.val = READ_ONCE(sk->sk_peek_off); 2033 break; 2034 case SO_NOFCS: 2035 v.val = sock_flag(sk, SOCK_NOFCS); 2036 break; 2037 2038 case SO_BINDTODEVICE: 2039 return sock_getbindtodevice(sk, optval, optlen, len); 2040 2041 case SO_GET_FILTER: 2042 len = sk_get_filter(sk, optval, len); 2043 if (len < 0) 2044 return len; 2045 2046 goto lenout; 2047 2048 case SO_LOCK_FILTER: 2049 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 2050 break; 2051 2052 case SO_BPF_EXTENSIONS: 2053 v.val = bpf_tell_extensions(); 2054 break; 2055 2056 case SO_SELECT_ERR_QUEUE: 2057 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 2058 break; 2059 2060 #ifdef CONFIG_NET_RX_BUSY_POLL 2061 case SO_BUSY_POLL: 2062 v.val = READ_ONCE(sk->sk_ll_usec); 2063 break; 2064 case SO_PREFER_BUSY_POLL: 2065 v.val = READ_ONCE(sk->sk_prefer_busy_poll); 2066 break; 2067 #endif 2068 2069 case SO_MAX_PACING_RATE: 2070 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */ 2071 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) { 2072 lv = sizeof(v.ulval); 2073 v.ulval = READ_ONCE(sk->sk_max_pacing_rate); 2074 } else { 2075 /* 32bit version */ 2076 v.val = min_t(unsigned long, ~0U, 2077 READ_ONCE(sk->sk_max_pacing_rate)); 2078 } 2079 break; 2080 2081 case SO_INCOMING_CPU: 2082 v.val = READ_ONCE(sk->sk_incoming_cpu); 2083 break; 2084 2085 case SO_MEMINFO: 2086 { 2087 u32 meminfo[SK_MEMINFO_VARS]; 2088 2089 sk_get_meminfo(sk, meminfo); 2090 2091 len = min_t(unsigned int, len, sizeof(meminfo)); 2092 if (copy_to_sockptr(optval, &meminfo, len)) 2093 return -EFAULT; 2094 2095 goto lenout; 2096 } 2097 2098 #ifdef CONFIG_NET_RX_BUSY_POLL 2099 case SO_INCOMING_NAPI_ID: 2100 v.val = READ_ONCE(sk->sk_napi_id); 2101 2102 /* aggregate non-NAPI IDs down to 0 */ 2103 if (!napi_id_valid(v.val)) 2104 v.val = 0; 2105 2106 break; 2107 #endif 2108 2109 case SO_COOKIE: 2110 lv = sizeof(u64); 2111 if (len < lv) 2112 return -EINVAL; 2113 v.val64 = sock_gen_cookie(sk); 2114 break; 2115 2116 case SO_ZEROCOPY: 2117 v.val = sock_flag(sk, SOCK_ZEROCOPY); 2118 break; 2119 2120 case SO_TXTIME: 2121 lv = sizeof(v.txtime); 2122 v.txtime.clockid = sk->sk_clockid; 2123 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 2124 SOF_TXTIME_DEADLINE_MODE : 0; 2125 v.txtime.flags |= sk->sk_txtime_report_errors ? 
2126 SOF_TXTIME_REPORT_ERRORS : 0; 2127 break; 2128 2129 case SO_BINDTOIFINDEX: 2130 v.val = READ_ONCE(sk->sk_bound_dev_if); 2131 break; 2132 2133 case SO_NETNS_COOKIE: 2134 lv = sizeof(u64); 2135 if (len != lv) 2136 return -EINVAL; 2137 v.val64 = sock_net(sk)->net_cookie; 2138 break; 2139 2140 case SO_BUF_LOCK: 2141 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK; 2142 break; 2143 2144 case SO_RESERVE_MEM: 2145 v.val = READ_ONCE(sk->sk_reserved_mem); 2146 break; 2147 2148 case SO_TXREHASH: 2149 if (!sk_is_tcp(sk)) 2150 return -EOPNOTSUPP; 2151 2152 /* Paired with WRITE_ONCE() in sk_setsockopt() */ 2153 v.val = READ_ONCE(sk->sk_txrehash); 2154 break; 2155 2156 default: 2157 /* We implement the SO_SNDLOWAT etc to not be settable 2158 * (1003.1g 7). 2159 */ 2160 return -ENOPROTOOPT; 2161 } 2162 2163 if (len > lv) 2164 len = lv; 2165 if (copy_to_sockptr(optval, &v, len)) 2166 return -EFAULT; 2167 lenout: 2168 if (copy_to_sockptr(optlen, &len, sizeof(int))) 2169 return -EFAULT; 2170 return 0; 2171 } 2172 2173 /* 2174 * Initialize an sk_lock. 2175 * 2176 * (We also register the sk_lock with the lock validator.) 2177 */ 2178 static inline void sock_lock_init(struct sock *sk) 2179 { 2180 sk_owner_clear(sk); 2181 2182 if (sk->sk_kern_sock) 2183 sock_lock_init_class_and_name( 2184 sk, 2185 af_family_kern_slock_key_strings[sk->sk_family], 2186 af_family_kern_slock_keys + sk->sk_family, 2187 af_family_kern_key_strings[sk->sk_family], 2188 af_family_kern_keys + sk->sk_family); 2189 else 2190 sock_lock_init_class_and_name( 2191 sk, 2192 af_family_slock_key_strings[sk->sk_family], 2193 af_family_slock_keys + sk->sk_family, 2194 af_family_key_strings[sk->sk_family], 2195 af_family_keys + sk->sk_family); 2196 } 2197 2198 /* 2199 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 2200 * even temporarily, because of RCU lookups. sk_node should also be left as is. 2201 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 2202 */ 2203 static void sock_copy(struct sock *nsk, const struct sock *osk) 2204 { 2205 const struct proto *prot = READ_ONCE(osk->sk_prot); 2206 #ifdef CONFIG_SECURITY_NETWORK 2207 void *sptr = nsk->sk_security; 2208 #endif 2209 2210 /* If we move sk_tx_queue_mapping out of the private section, 2211 * we must check if sk_tx_queue_clear() is called after 2212 * sock_copy() in sk_clone_lock(). 
2213 */ 2214 BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) < 2215 offsetof(struct sock, sk_dontcopy_begin) || 2216 offsetof(struct sock, sk_tx_queue_mapping) >= 2217 offsetof(struct sock, sk_dontcopy_end)); 2218 2219 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 2220 2221 unsafe_memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 2222 prot->obj_size - offsetof(struct sock, sk_dontcopy_end), 2223 /* alloc is larger than struct, see sk_prot_alloc() */); 2224 2225 #ifdef CONFIG_SECURITY_NETWORK 2226 nsk->sk_security = sptr; 2227 security_sk_clone(osk, nsk); 2228 #endif 2229 } 2230 2231 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 2232 int family) 2233 { 2234 struct sock *sk; 2235 struct kmem_cache *slab; 2236 2237 slab = prot->slab; 2238 if (slab != NULL) { 2239 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 2240 if (!sk) 2241 return sk; 2242 if (want_init_on_alloc(priority)) 2243 sk_prot_clear_nulls(sk, prot->obj_size); 2244 } else 2245 sk = kmalloc(prot->obj_size, priority); 2246 2247 if (sk != NULL) { 2248 if (security_sk_alloc(sk, family, priority)) 2249 goto out_free; 2250 2251 if (!try_module_get(prot->owner)) 2252 goto out_free_sec; 2253 } 2254 2255 return sk; 2256 2257 out_free_sec: 2258 security_sk_free(sk); 2259 out_free: 2260 if (slab != NULL) 2261 kmem_cache_free(slab, sk); 2262 else 2263 kfree(sk); 2264 return NULL; 2265 } 2266 2267 static void sk_prot_free(struct proto *prot, struct sock *sk) 2268 { 2269 struct kmem_cache *slab; 2270 struct module *owner; 2271 2272 owner = prot->owner; 2273 slab = prot->slab; 2274 2275 cgroup_sk_free(&sk->sk_cgrp_data); 2276 mem_cgroup_sk_free(sk); 2277 security_sk_free(sk); 2278 2279 sk_owner_put(sk); 2280 2281 if (slab != NULL) 2282 kmem_cache_free(slab, sk); 2283 else 2284 kfree(sk); 2285 module_put(owner); 2286 } 2287 2288 /** 2289 * sk_alloc - All socket objects are allocated here 2290 * @net: the applicable net namespace 2291 * @family: protocol family 2292 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2293 * @prot: struct proto associated with this new sock instance 2294 * @kern: is this to be a kernel socket? 2295 */ 2296 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 2297 struct proto *prot, int kern) 2298 { 2299 struct sock *sk; 2300 2301 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 2302 if (sk) { 2303 sk->sk_family = family; 2304 /* 2305 * See comment in struct sock definition to understand 2306 * why we need sk_prot_creator -acme 2307 */ 2308 sk->sk_prot = sk->sk_prot_creator = prot; 2309 2310 if (READ_ONCE(net->core.sysctl_bypass_prot_mem)) 2311 sk->sk_bypass_prot_mem = 1; 2312 2313 sk->sk_kern_sock = kern; 2314 sock_lock_init(sk); 2315 2316 sk->sk_net_refcnt = kern ? 0 : 1; 2317 if (likely(sk->sk_net_refcnt)) { 2318 get_net_track(net, &sk->ns_tracker, priority); 2319 sock_inuse_add(net, 1); 2320 } else { 2321 net_passive_inc(net); 2322 __netns_tracker_alloc(net, &sk->ns_tracker, 2323 false, priority); 2324 } 2325 2326 sock_net_set(sk, net); 2327 refcount_set(&sk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2328 2329 mem_cgroup_sk_alloc(sk); 2330 cgroup_sk_alloc(&sk->sk_cgrp_data); 2331 sock_update_classid(&sk->sk_cgrp_data); 2332 sock_update_netprioidx(&sk->sk_cgrp_data); 2333 sk_tx_queue_clear(sk); 2334 } 2335 2336 return sk; 2337 } 2338 EXPORT_SYMBOL(sk_alloc); 2339 2340 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 2341 * grace period. This is the case for UDP sockets and TCP listeners. 
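 *
 * A protocol opts in by setting the flag before the socket becomes
 * visible to lockless lookups (illustrative sketch, not tied to any
 * particular protocol):
 *
 *	sock_set_flag(sk, SOCK_RCU_FREE);
 *
 * and only then hashing the socket so that RCU readers can find it.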
2342 */ 2343 static void __sk_destruct(struct rcu_head *head) 2344 { 2345 struct sock *sk = container_of(head, struct sock, sk_rcu); 2346 struct net *net = sock_net(sk); 2347 struct sk_filter *filter; 2348 2349 if (sk->sk_destruct) 2350 sk->sk_destruct(sk); 2351 2352 filter = rcu_dereference_check(sk->sk_filter, 2353 refcount_read(&sk->sk_wmem_alloc) == 0); 2354 if (filter) { 2355 sk_filter_uncharge(sk, filter); 2356 RCU_INIT_POINTER(sk->sk_filter, NULL); 2357 } 2358 2359 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 2360 2361 #ifdef CONFIG_BPF_SYSCALL 2362 bpf_sk_storage_free(sk); 2363 #endif 2364 2365 if (atomic_read(&sk->sk_omem_alloc)) 2366 pr_debug("%s: optmem leakage (%d bytes) detected\n", 2367 __func__, atomic_read(&sk->sk_omem_alloc)); 2368 2369 if (sk->sk_frag.page) { 2370 put_page(sk->sk_frag.page); 2371 sk->sk_frag.page = NULL; 2372 } 2373 2374 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */ 2375 put_cred(sk->sk_peer_cred); 2376 put_pid(sk->sk_peer_pid); 2377 2378 if (likely(sk->sk_net_refcnt)) { 2379 put_net_track(net, &sk->ns_tracker); 2380 } else { 2381 __netns_tracker_free(net, &sk->ns_tracker, false); 2382 net_passive_dec(net); 2383 } 2384 sk_prot_free(sk->sk_prot_creator, sk); 2385 } 2386 2387 void sk_net_refcnt_upgrade(struct sock *sk) 2388 { 2389 struct net *net = sock_net(sk); 2390 2391 WARN_ON_ONCE(sk->sk_net_refcnt); 2392 __netns_tracker_free(net, &sk->ns_tracker, false); 2393 net_passive_dec(net); 2394 sk->sk_net_refcnt = 1; 2395 get_net_track(net, &sk->ns_tracker, GFP_KERNEL); 2396 sock_inuse_add(net, 1); 2397 } 2398 EXPORT_SYMBOL_GPL(sk_net_refcnt_upgrade); 2399 2400 void sk_destruct(struct sock *sk) 2401 { 2402 bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE); 2403 2404 if (rcu_access_pointer(sk->sk_reuseport_cb)) { 2405 reuseport_detach_sock(sk); 2406 use_call_rcu = true; 2407 } 2408 2409 if (use_call_rcu) 2410 call_rcu(&sk->sk_rcu, __sk_destruct); 2411 else 2412 __sk_destruct(&sk->sk_rcu); 2413 } 2414 2415 static void __sk_free(struct sock *sk) 2416 { 2417 if (likely(sk->sk_net_refcnt)) 2418 sock_inuse_add(sock_net(sk), -1); 2419 2420 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 2421 sock_diag_broadcast_destroy(sk); 2422 else 2423 sk_destruct(sk); 2424 } 2425 2426 void sk_free(struct sock *sk) 2427 { 2428 /* 2429 * We subtract one from sk_wmem_alloc and can know if 2430 * some packets are still in some tx queue. 
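 * (sk_wmem_alloc is seeded with SK_WMEM_ALLOC_BIAS in sk_alloc();
 * skb_set_owner_w() adds each queued skb's truesize to it and
 * sock_wfree() gives that back on transmit completion, so the
 * socket cannot be destroyed while packets are still in flight.)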
2431 * If not null, sock_wfree() will call __sk_free(sk) later 2432 */ 2433 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 2434 __sk_free(sk); 2435 } 2436 EXPORT_SYMBOL(sk_free); 2437 2438 static void sk_init_common(struct sock *sk) 2439 { 2440 skb_queue_head_init(&sk->sk_receive_queue); 2441 skb_queue_head_init(&sk->sk_write_queue); 2442 skb_queue_head_init(&sk->sk_error_queue); 2443 2444 rwlock_init(&sk->sk_callback_lock); 2445 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 2446 af_rlock_keys + sk->sk_family, 2447 af_family_rlock_key_strings[sk->sk_family]); 2448 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 2449 af_wlock_keys + sk->sk_family, 2450 af_family_wlock_key_strings[sk->sk_family]); 2451 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 2452 af_elock_keys + sk->sk_family, 2453 af_family_elock_key_strings[sk->sk_family]); 2454 if (sk->sk_kern_sock) 2455 lockdep_set_class_and_name(&sk->sk_callback_lock, 2456 af_kern_callback_keys + sk->sk_family, 2457 af_family_kern_clock_key_strings[sk->sk_family]); 2458 else 2459 lockdep_set_class_and_name(&sk->sk_callback_lock, 2460 af_callback_keys + sk->sk_family, 2461 af_family_clock_key_strings[sk->sk_family]); 2462 } 2463 2464 /** 2465 * sk_clone_lock - clone a socket, and lock its clone 2466 * @sk: the socket to clone 2467 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 2468 * 2469 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 2470 */ 2471 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 2472 { 2473 struct proto *prot = READ_ONCE(sk->sk_prot); 2474 struct sk_filter *filter; 2475 bool is_charged = true; 2476 struct sock *newsk; 2477 2478 newsk = sk_prot_alloc(prot, priority, sk->sk_family); 2479 if (!newsk) 2480 goto out; 2481 2482 sock_copy(newsk, sk); 2483 2484 newsk->sk_prot_creator = prot; 2485 2486 /* SANITY */ 2487 if (likely(newsk->sk_net_refcnt)) { 2488 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); 2489 sock_inuse_add(sock_net(newsk), 1); 2490 } else { 2491 /* Kernel sockets are not elevating the struct net refcount. 2492 * Instead, use a tracker to more easily detect if a layer 2493 * is not properly dismantling its kernel sockets at netns 2494 * destroy time. 
2495 */ 2496 net_passive_inc(sock_net(newsk)); 2497 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, 2498 false, priority); 2499 } 2500 sk_node_init(&newsk->sk_node); 2501 sock_lock_init(newsk); 2502 bh_lock_sock(newsk); 2503 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 2504 newsk->sk_backlog.len = 0; 2505 2506 atomic_set(&newsk->sk_rmem_alloc, 0); 2507 2508 refcount_set(&newsk->sk_wmem_alloc, SK_WMEM_ALLOC_BIAS); 2509 2510 atomic_set(&newsk->sk_omem_alloc, 0); 2511 sk_init_common(newsk); 2512 2513 newsk->sk_dst_cache = NULL; 2514 newsk->sk_dst_pending_confirm = 0; 2515 newsk->sk_wmem_queued = 0; 2516 newsk->sk_forward_alloc = 0; 2517 newsk->sk_reserved_mem = 0; 2518 DEBUG_NET_WARN_ON_ONCE(newsk->sk_drop_counters); 2519 sk_drops_reset(newsk); 2520 newsk->sk_send_head = NULL; 2521 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 2522 atomic_set(&newsk->sk_zckey, 0); 2523 2524 sock_reset_flag(newsk, SOCK_DONE); 2525 2526 #ifdef CONFIG_MEMCG 2527 /* sk->sk_memcg will be populated at accept() time */ 2528 newsk->sk_memcg = NULL; 2529 #endif 2530 2531 cgroup_sk_clone(&newsk->sk_cgrp_data); 2532 2533 rcu_read_lock(); 2534 filter = rcu_dereference(sk->sk_filter); 2535 if (filter != NULL) 2536 /* though it's an empty new sock, the charging may fail 2537 * if sysctl_optmem_max was changed between creation of 2538 * original socket and cloning 2539 */ 2540 is_charged = sk_filter_charge(newsk, filter); 2541 RCU_INIT_POINTER(newsk->sk_filter, filter); 2542 rcu_read_unlock(); 2543 2544 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 2545 /* We need to make sure that we don't uncharge the new 2546 * socket if we couldn't charge it in the first place 2547 * as otherwise we uncharge the parent's filter. 2548 */ 2549 if (!is_charged) 2550 RCU_INIT_POINTER(newsk->sk_filter, NULL); 2551 2552 goto free; 2553 } 2554 2555 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 2556 2557 if (bpf_sk_storage_clone(sk, newsk)) 2558 goto free; 2559 2560 /* Clear sk_user_data if parent had the pointer tagged 2561 * as not suitable for copying when cloning. 2562 */ 2563 if (sk_user_data_is_nocopy(newsk)) 2564 newsk->sk_user_data = NULL; 2565 2566 newsk->sk_err = 0; 2567 newsk->sk_err_soft = 0; 2568 newsk->sk_priority = 0; 2569 newsk->sk_incoming_cpu = raw_smp_processor_id(); 2570 2571 /* Before updating sk_refcnt, we must commit prior changes to memory 2572 * (Documentation/RCU/rculist_nulls.rst for details) 2573 */ 2574 smp_wmb(); 2575 refcount_set(&newsk->sk_refcnt, 2); 2576 2577 sk_set_socket(newsk, NULL); 2578 sk_tx_queue_clear(newsk); 2579 RCU_INIT_POINTER(newsk->sk_wq, NULL); 2580 2581 if (newsk->sk_prot->sockets_allocated) 2582 sk_sockets_allocated_inc(newsk); 2583 2584 if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP) 2585 net_enable_timestamp(); 2586 out: 2587 return newsk; 2588 free: 2589 /* It is still raw copy of parent, so invalidate 2590 * destructor and make plain sk_free() 2591 */ 2592 newsk->sk_destruct = NULL; 2593 bh_unlock_sock(newsk); 2594 sk_free(newsk); 2595 newsk = NULL; 2596 goto out; 2597 } 2598 EXPORT_SYMBOL_GPL(sk_clone_lock); 2599 2600 static u32 sk_dst_gso_max_size(struct sock *sk, const struct net_device *dev) 2601 { 2602 bool is_ipv6 = false; 2603 u32 max_size; 2604 2605 #if IS_ENABLED(CONFIG_IPV6) 2606 is_ipv6 = (sk->sk_family == AF_INET6 && 2607 !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)); 2608 #endif 2609 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */ 2610 max_size = is_ipv6 ? 
READ_ONCE(dev->gso_max_size) : 2611 READ_ONCE(dev->gso_ipv4_max_size); 2612 if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk)) 2613 max_size = GSO_LEGACY_MAX_SIZE; 2614 2615 return max_size - (MAX_TCP_HEADER + 1); 2616 } 2617 2618 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 2619 { 2620 const struct net_device *dev; 2621 u32 max_segs = 1; 2622 2623 rcu_read_lock(); 2624 dev = dst_dev_rcu(dst); 2625 sk->sk_route_caps = dev->features; 2626 if (sk_is_tcp(sk)) { 2627 struct inet_connection_sock *icsk = inet_csk(sk); 2628 2629 sk->sk_route_caps |= NETIF_F_GSO; 2630 icsk->icsk_ack.dst_quick_ack = dst_metric(dst, RTAX_QUICKACK); 2631 } 2632 if (sk->sk_route_caps & NETIF_F_GSO) 2633 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 2634 if (unlikely(sk->sk_gso_disabled)) 2635 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2636 if (sk_can_gso(sk)) { 2637 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 2638 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 2639 } else { 2640 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 2641 sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dev); 2642 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */ 2643 max_segs = max_t(u32, READ_ONCE(dev->gso_max_segs), 1); 2644 } 2645 } 2646 sk->sk_gso_max_segs = max_segs; 2647 sk_dst_set(sk, dst); 2648 rcu_read_unlock(); 2649 } 2650 EXPORT_SYMBOL_GPL(sk_setup_caps); 2651 2652 /* 2653 * Simple resource managers for sockets. 2654 */ 2655 2656 2657 /* 2658 * Write buffer destructor automatically called from kfree_skb. 2659 */ 2660 void sock_wfree(struct sk_buff *skb) 2661 { 2662 unsigned int len = skb->truesize; 2663 struct sock *sk = skb->sk; 2664 bool free; 2665 int old; 2666 2667 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 2668 if (sock_flag(sk, SOCK_RCU_FREE) && 2669 sk->sk_write_space == sock_def_write_space) { 2670 rcu_read_lock(); 2671 free = __refcount_sub_and_test(len, &sk->sk_wmem_alloc, 2672 &old); 2673 sock_def_write_space_wfree(sk, old - len); 2674 rcu_read_unlock(); 2675 if (unlikely(free)) 2676 __sk_free(sk); 2677 return; 2678 } 2679 2680 /* 2681 * Keep a reference on sk_wmem_alloc, this will be released 2682 * after sk_write_space() call 2683 */ 2684 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 2685 sk->sk_write_space(sk); 2686 len = 1; 2687 } 2688 /* 2689 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 2690 * could not do because of in-flight packets 2691 */ 2692 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 2693 __sk_free(sk); 2694 } 2695 EXPORT_SYMBOL(sock_wfree); 2696 2697 /* This variant of sock_wfree() is used by TCP, 2698 * since it sets SOCK_USE_WRITE_QUEUE. 2699 */ 2700 void __sock_wfree(struct sk_buff *skb) 2701 { 2702 struct sock *sk = skb->sk; 2703 2704 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 2705 __sk_free(sk); 2706 } 2707 2708 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 2709 { 2710 int old_wmem; 2711 2712 skb_orphan(skb); 2713 #ifdef CONFIG_INET 2714 if (unlikely(!sk_fullsock(sk))) 2715 return skb_set_owner_edemux(skb, sk); 2716 #endif 2717 skb->sk = sk; 2718 skb->destructor = sock_wfree; 2719 skb_set_hash_from_sk(skb, sk); 2720 /* 2721 * We used to take a refcount on sk, but following operation 2722 * is enough to guarantee sk_free() won't free this sock until 2723 * all in-flight packets are completed 2724 */ 2725 __refcount_add(skb->truesize, &sk->sk_wmem_alloc, &old_wmem); 2726 2727 /* (old_wmem == SK_WMEM_ALLOC_BIAS) if no other TX packet for this socket 2728 * is in a host queue (qdisc, NIC queue). 
2729 * Set skb->ooo_okay so that netdev_pick_tx() can choose a TX queue 2730 * based on XPS for better performance. 2731 * Otherwise clear ooo_okay to not risk Out Of Order delivery. 2732 */ 2733 skb->ooo_okay = (old_wmem == SK_WMEM_ALLOC_BIAS); 2734 } 2735 EXPORT_SYMBOL(skb_set_owner_w); 2736 2737 static bool can_skb_orphan_partial(const struct sk_buff *skb) 2738 { 2739 /* Drivers depend on in-order delivery for crypto offload, 2740 * partial orphan breaks out-of-order-OK logic. 2741 */ 2742 if (skb_is_decrypted(skb)) 2743 return false; 2744 2745 return (skb->destructor == sock_wfree || 2746 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree)); 2747 } 2748 2749 /* This helper is used by netem, as it can hold packets in its 2750 * delay queue. We want to allow the owner socket to send more 2751 * packets, as if they were already TX completed by a typical driver. 2752 * But we also want to keep skb->sk set because some packet schedulers 2753 * rely on it (sch_fq for example). 2754 */ 2755 void skb_orphan_partial(struct sk_buff *skb) 2756 { 2757 if (skb_is_tcp_pure_ack(skb)) 2758 return; 2759 2760 if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk)) 2761 return; 2762 2763 skb_orphan(skb); 2764 } 2765 EXPORT_SYMBOL(skb_orphan_partial); 2766 2767 /* 2768 * Read buffer destructor automatically called from kfree_skb. 2769 */ 2770 void sock_rfree(struct sk_buff *skb) 2771 { 2772 struct sock *sk = skb->sk; 2773 unsigned int len = skb->truesize; 2774 2775 atomic_sub(len, &sk->sk_rmem_alloc); 2776 sk_mem_uncharge(sk, len); 2777 } 2778 EXPORT_SYMBOL(sock_rfree); 2779 2780 /* 2781 * Buffer destructor for skbs that are not used directly in read or write 2782 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 2783 */ 2784 void sock_efree(struct sk_buff *skb) 2785 { 2786 sock_put(skb->sk); 2787 } 2788 EXPORT_SYMBOL(sock_efree); 2789 2790 /* Buffer destructor for prefetch/receive path where reference count may 2791 * not be held, e.g. for listen sockets. 2792 */ 2793 #ifdef CONFIG_INET 2794 void sock_pfree(struct sk_buff *skb) 2795 { 2796 struct sock *sk = skb->sk; 2797 2798 if (!sk_is_refcounted(sk)) 2799 return; 2800 2801 if (sk->sk_state == TCP_NEW_SYN_RECV && inet_reqsk(sk)->syncookie) { 2802 inet_reqsk(sk)->rsk_listener = NULL; 2803 reqsk_free(inet_reqsk(sk)); 2804 return; 2805 } 2806 2807 sock_gen_put(sk); 2808 } 2809 EXPORT_SYMBOL(sock_pfree); 2810 #endif /* CONFIG_INET */ 2811 2812 /* 2813 * Allocate a skb from the socket's send buffer. 
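 *
 * Returns NULL if the allocation fails, or if the socket is already
 * over its send buffer limit and force is not set; on success the
 * skb is charged to the socket via skb_set_owner_w().
 * A typical caller pattern (sketch only, names illustrative):
 *
 *	skb = sock_wmalloc(sk, payload_len + headroom, 0, GFP_KERNEL);
 *	if (!skb)
 *		return -ENOBUFS;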
2814 */ 2815 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 2816 gfp_t priority) 2817 { 2818 if (force || 2819 refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) { 2820 struct sk_buff *skb = alloc_skb(size, priority); 2821 2822 if (skb) { 2823 skb_set_owner_w(skb, sk); 2824 return skb; 2825 } 2826 } 2827 return NULL; 2828 } 2829 EXPORT_SYMBOL(sock_wmalloc); 2830 2831 static void sock_ofree(struct sk_buff *skb) 2832 { 2833 struct sock *sk = skb->sk; 2834 2835 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 2836 } 2837 2838 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 2839 gfp_t priority) 2840 { 2841 struct sk_buff *skb; 2842 2843 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 2844 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 2845 READ_ONCE(sock_net(sk)->core.sysctl_optmem_max)) 2846 return NULL; 2847 2848 skb = alloc_skb(size, priority); 2849 if (!skb) 2850 return NULL; 2851 2852 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2853 skb->sk = sk; 2854 skb->destructor = sock_ofree; 2855 return skb; 2856 } 2857 2858 /* 2859 * Allocate a memory block from the socket's option memory buffer. 2860 */ 2861 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2862 { 2863 int optmem_max = READ_ONCE(sock_net(sk)->core.sysctl_optmem_max); 2864 2865 if ((unsigned int)size <= optmem_max && 2866 atomic_read(&sk->sk_omem_alloc) + size < optmem_max) { 2867 void *mem; 2868 /* First do the add, to avoid the race if kmalloc 2869 * might sleep. 2870 */ 2871 atomic_add(size, &sk->sk_omem_alloc); 2872 mem = kmalloc(size, priority); 2873 if (mem) 2874 return mem; 2875 atomic_sub(size, &sk->sk_omem_alloc); 2876 } 2877 return NULL; 2878 } 2879 EXPORT_SYMBOL(sock_kmalloc); 2880 2881 /* 2882 * Duplicate the input "src" memory block using the socket's 2883 * option memory buffer. 2884 */ 2885 void *sock_kmemdup(struct sock *sk, const void *src, 2886 int size, gfp_t priority) 2887 { 2888 void *mem; 2889 2890 mem = sock_kmalloc(sk, size, priority); 2891 if (mem) 2892 memcpy(mem, src, size); 2893 return mem; 2894 } 2895 EXPORT_SYMBOL(sock_kmemdup); 2896 2897 /* Free an option memory block. Note, we actually want the inline 2898 * here as this allows gcc to detect the nullify and fold away the 2899 * condition entirely. 2900 */ 2901 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2902 const bool nullify) 2903 { 2904 if (WARN_ON_ONCE(!mem)) 2905 return; 2906 if (nullify) 2907 kfree_sensitive(mem); 2908 else 2909 kfree(mem); 2910 atomic_sub(size, &sk->sk_omem_alloc); 2911 } 2912 2913 void sock_kfree_s(struct sock *sk, void *mem, int size) 2914 { 2915 __sock_kfree_s(sk, mem, size, false); 2916 } 2917 EXPORT_SYMBOL(sock_kfree_s); 2918 2919 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2920 { 2921 __sock_kfree_s(sk, mem, size, true); 2922 } 2923 EXPORT_SYMBOL(sock_kzfree_s); 2924 2925 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2926 I think, these locks should be removed for datagram sockets. 
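 *
 * The loop below sleeps until send buffer space shows up, the
 * timeout expires, a signal is pending, the socket is shut down for
 * sending, or an error is set; it is used by sock_alloc_send_pskb()
 * further down.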
2927 */ 2928 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2929 { 2930 DEFINE_WAIT(wait); 2931 2932 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2933 for (;;) { 2934 if (!timeo) 2935 break; 2936 if (signal_pending(current)) 2937 break; 2938 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2939 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2940 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) 2941 break; 2942 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2943 break; 2944 if (READ_ONCE(sk->sk_err)) 2945 break; 2946 timeo = schedule_timeout(timeo); 2947 } 2948 finish_wait(sk_sleep(sk), &wait); 2949 return timeo; 2950 } 2951 2952 2953 /* 2954 * Generic send/receive buffer handlers 2955 */ 2956 2957 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2958 unsigned long data_len, int noblock, 2959 int *errcode, int max_page_order) 2960 { 2961 struct sk_buff *skb; 2962 long timeo; 2963 int err; 2964 2965 timeo = sock_sndtimeo(sk, noblock); 2966 for (;;) { 2967 err = sock_error(sk); 2968 if (err != 0) 2969 goto failure; 2970 2971 err = -EPIPE; 2972 if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) 2973 goto failure; 2974 2975 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf)) 2976 break; 2977 2978 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2979 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2980 err = -EAGAIN; 2981 if (!timeo) 2982 goto failure; 2983 if (signal_pending(current)) 2984 goto interrupted; 2985 timeo = sock_wait_for_wmem(sk, timeo); 2986 } 2987 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2988 errcode, sk->sk_allocation); 2989 if (skb) 2990 skb_set_owner_w(skb, sk); 2991 return skb; 2992 2993 interrupted: 2994 err = sock_intr_errno(timeo); 2995 failure: 2996 *errcode = err; 2997 return NULL; 2998 } 2999 EXPORT_SYMBOL(sock_alloc_send_pskb); 3000 3001 int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, 3002 struct sockcm_cookie *sockc) 3003 { 3004 u32 tsflags; 3005 3006 BUILD_BUG_ON(SOF_TIMESTAMPING_LAST == (1 << 31)); 3007 3008 switch (cmsg->cmsg_type) { 3009 case SO_MARK: 3010 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && 3011 !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 3012 return -EPERM; 3013 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3014 return -EINVAL; 3015 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 3016 break; 3017 case SO_TIMESTAMPING_OLD: 3018 case SO_TIMESTAMPING_NEW: 3019 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3020 return -EINVAL; 3021 3022 tsflags = *(u32 *)CMSG_DATA(cmsg); 3023 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 3024 return -EINVAL; 3025 3026 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 3027 sockc->tsflags |= tsflags; 3028 break; 3029 case SCM_TXTIME: 3030 if (!sock_flag(sk, SOCK_TXTIME)) 3031 return -EINVAL; 3032 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 3033 return -EINVAL; 3034 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 3035 break; 3036 case SCM_TS_OPT_ID: 3037 if (sk_is_tcp(sk)) 3038 return -EINVAL; 3039 tsflags = READ_ONCE(sk->sk_tsflags); 3040 if (!(tsflags & SOF_TIMESTAMPING_OPT_ID)) 3041 return -EINVAL; 3042 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3043 return -EINVAL; 3044 sockc->ts_opt_id = *(u32 *)CMSG_DATA(cmsg); 3045 sockc->tsflags |= SOCKCM_FLAG_TS_OPT_ID; 3046 break; 3047 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
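	 * They are accepted as no-ops here so that a sender which
	 * includes them is not rejected with -EINVAL; any real
	 * processing is done by the SCM/AF_UNIX code.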
*/ 3048 case SCM_RIGHTS: 3049 case SCM_CREDENTIALS: 3050 break; 3051 case SO_PRIORITY: 3052 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3053 return -EINVAL; 3054 if (!sk_set_prio_allowed(sk, *(u32 *)CMSG_DATA(cmsg))) 3055 return -EPERM; 3056 sockc->priority = *(u32 *)CMSG_DATA(cmsg); 3057 break; 3058 case SCM_DEVMEM_DMABUF: 3059 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 3060 return -EINVAL; 3061 sockc->dmabuf_id = *(u32 *)CMSG_DATA(cmsg); 3062 break; 3063 default: 3064 return -EINVAL; 3065 } 3066 return 0; 3067 } 3068 EXPORT_SYMBOL(__sock_cmsg_send); 3069 3070 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 3071 struct sockcm_cookie *sockc) 3072 { 3073 struct cmsghdr *cmsg; 3074 int ret; 3075 3076 for_each_cmsghdr(cmsg, msg) { 3077 if (!CMSG_OK(msg, cmsg)) 3078 return -EINVAL; 3079 if (cmsg->cmsg_level != SOL_SOCKET) 3080 continue; 3081 ret = __sock_cmsg_send(sk, cmsg, sockc); 3082 if (ret) 3083 return ret; 3084 } 3085 return 0; 3086 } 3087 EXPORT_SYMBOL(sock_cmsg_send); 3088 3089 static void sk_enter_memory_pressure(struct sock *sk) 3090 { 3091 if (!sk->sk_prot->enter_memory_pressure) 3092 return; 3093 3094 sk->sk_prot->enter_memory_pressure(sk); 3095 } 3096 3097 static void sk_leave_memory_pressure(struct sock *sk) 3098 { 3099 if (sk->sk_prot->leave_memory_pressure) { 3100 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure, 3101 tcp_leave_memory_pressure, sk); 3102 } else { 3103 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 3104 3105 if (memory_pressure && READ_ONCE(*memory_pressure)) 3106 WRITE_ONCE(*memory_pressure, 0); 3107 } 3108 } 3109 3110 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); 3111 3112 /** 3113 * skb_page_frag_refill - check that a page_frag contains enough room 3114 * @sz: minimum size of the fragment we want to get 3115 * @pfrag: pointer to page_frag 3116 * @gfp: priority for memory allocation 3117 * 3118 * Note: While this allocator tries to use high order pages, there is 3119 * no guarantee that allocations succeed. Therefore, @sz MUST be 3120 * less or equal than PAGE_SIZE. 
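 *
 * Return: true when @pfrag has at least @sz bytes of room left at
 * @pfrag->offset, false if no page could be allocated.
 *
 * A typical caller pattern (sketch only, names illustrative):
 *
 *	if (!skb_page_frag_refill(copy, pfrag, GFP_KERNEL))
 *		goto wait_for_memory;
 *	memcpy(page_address(pfrag->page) + pfrag->offset, data, copy);
 *	pfrag->offset += copy;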
3121 */ 3122 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 3123 { 3124 if (pfrag->page) { 3125 if (page_ref_count(pfrag->page) == 1) { 3126 pfrag->offset = 0; 3127 return true; 3128 } 3129 if (pfrag->offset + sz <= pfrag->size) 3130 return true; 3131 put_page(pfrag->page); 3132 } 3133 3134 pfrag->offset = 0; 3135 if (SKB_FRAG_PAGE_ORDER && 3136 !static_branch_unlikely(&net_high_order_alloc_disable_key)) { 3137 /* Avoid direct reclaim but allow kswapd to wake */ 3138 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 3139 __GFP_COMP | __GFP_NOWARN | 3140 __GFP_NORETRY, 3141 SKB_FRAG_PAGE_ORDER); 3142 if (likely(pfrag->page)) { 3143 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 3144 return true; 3145 } 3146 } 3147 pfrag->page = alloc_page(gfp); 3148 if (likely(pfrag->page)) { 3149 pfrag->size = PAGE_SIZE; 3150 return true; 3151 } 3152 return false; 3153 } 3154 EXPORT_SYMBOL(skb_page_frag_refill); 3155 3156 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 3157 { 3158 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 3159 return true; 3160 3161 if (!sk->sk_bypass_prot_mem) 3162 sk_enter_memory_pressure(sk); 3163 3164 sk_stream_moderate_sndbuf(sk); 3165 3166 return false; 3167 } 3168 EXPORT_SYMBOL(sk_page_frag_refill); 3169 3170 void __lock_sock(struct sock *sk) 3171 __releases(&sk->sk_lock.slock) 3172 __acquires(&sk->sk_lock.slock) 3173 { 3174 DEFINE_WAIT(wait); 3175 3176 for (;;) { 3177 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 3178 TASK_UNINTERRUPTIBLE); 3179 spin_unlock_bh(&sk->sk_lock.slock); 3180 schedule(); 3181 spin_lock_bh(&sk->sk_lock.slock); 3182 if (!sock_owned_by_user(sk)) 3183 break; 3184 } 3185 finish_wait(&sk->sk_lock.wq, &wait); 3186 } 3187 3188 void __release_sock(struct sock *sk) 3189 __releases(&sk->sk_lock.slock) 3190 __acquires(&sk->sk_lock.slock) 3191 { 3192 struct sk_buff *skb, *next; 3193 int nb = 0; 3194 3195 while ((skb = sk->sk_backlog.head) != NULL) { 3196 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 3197 3198 spin_unlock_bh(&sk->sk_lock.slock); 3199 3200 while (1) { 3201 next = skb->next; 3202 prefetch(next); 3203 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb)); 3204 skb_mark_not_on_list(skb); 3205 sk_backlog_rcv(sk, skb); 3206 3207 skb = next; 3208 if (!skb) 3209 break; 3210 3211 if (!(++nb & 15)) 3212 cond_resched(); 3213 } 3214 3215 spin_lock_bh(&sk->sk_lock.slock); 3216 } 3217 3218 /* 3219 * Doing the zeroing here guarantee we can not loop forever 3220 * while a wild producer attempts to flood us. 3221 */ 3222 sk->sk_backlog.len = 0; 3223 } 3224 3225 void __sk_flush_backlog(struct sock *sk) 3226 { 3227 spin_lock_bh(&sk->sk_lock.slock); 3228 __release_sock(sk); 3229 3230 if (sk->sk_prot->release_cb) 3231 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3232 tcp_release_cb, sk); 3233 3234 spin_unlock_bh(&sk->sk_lock.slock); 3235 } 3236 EXPORT_SYMBOL_GPL(__sk_flush_backlog); 3237 3238 /** 3239 * sk_wait_data - wait for data to arrive at sk_receive_queue 3240 * @sk: sock to wait on 3241 * @timeo: for how long 3242 * @skb: last skb seen on sk_receive_queue 3243 * 3244 * Now socket state including sk->sk_err is changed only under lock, 3245 * hence we may omit checks after joining wait queue. 3246 * We check receive queue before schedule() only as optimization; 3247 * it is very likely that release_sock() added new data. 
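 *
 * The socket lock must be held; sk_wait_event() drops and re-acquires
 * it around the sleep. A typical recvmsg() loop (sketch only):
 *
 *	long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 *
 *	while (!(skb = skb_peek(&sk->sk_receive_queue))) {
 *		if (!timeo || signal_pending(current))
 *			break;
 *		sk_wait_data(sk, &timeo, NULL);
 *	}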
3248 */ 3249 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 3250 { 3251 DEFINE_WAIT_FUNC(wait, woken_wake_function); 3252 int rc; 3253 3254 add_wait_queue(sk_sleep(sk), &wait); 3255 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3256 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait); 3257 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 3258 remove_wait_queue(sk_sleep(sk), &wait); 3259 return rc; 3260 } 3261 EXPORT_SYMBOL(sk_wait_data); 3262 3263 /** 3264 * __sk_mem_raise_allocated - increase memory_allocated 3265 * @sk: socket 3266 * @size: memory size to allocate 3267 * @amt: pages to allocate 3268 * @kind: allocation type 3269 * 3270 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc. 3271 * 3272 * Unlike the globally shared limits among the sockets under same protocol, 3273 * consuming the budget of a memcg won't have direct effect on other ones. 3274 * So be optimistic about memcg's tolerance, and leave the callers to decide 3275 * whether or not to raise allocated through sk_under_memory_pressure() or 3276 * its variants. 3277 */ 3278 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind) 3279 { 3280 bool memcg_enabled = false, charged = false; 3281 struct proto *prot = sk->sk_prot; 3282 long allocated = 0; 3283 3284 if (!sk->sk_bypass_prot_mem) { 3285 sk_memory_allocated_add(sk, amt); 3286 allocated = sk_memory_allocated(sk); 3287 } 3288 3289 if (mem_cgroup_sk_enabled(sk)) { 3290 memcg_enabled = true; 3291 charged = mem_cgroup_sk_charge(sk, amt, gfp_memcg_charge()); 3292 if (!charged) 3293 goto suppress_allocation; 3294 } 3295 3296 if (!allocated) 3297 return 1; 3298 3299 /* Under limit. */ 3300 if (allocated <= sk_prot_mem_limits(sk, 0)) { 3301 sk_leave_memory_pressure(sk); 3302 return 1; 3303 } 3304 3305 /* Under pressure. */ 3306 if (allocated > sk_prot_mem_limits(sk, 1)) 3307 sk_enter_memory_pressure(sk); 3308 3309 /* Over hard limit. */ 3310 if (allocated > sk_prot_mem_limits(sk, 2)) 3311 goto suppress_allocation; 3312 3313 /* Guarantee minimum buffer size under pressure (either global 3314 * or memcg) to make sure features described in RFC 7323 (TCP 3315 * Extensions for High Performance) work properly. 3316 * 3317 * This rule does NOT stand when exceeds global or memcg's hard 3318 * limit, or else a DoS attack can be taken place by spawning 3319 * lots of sockets whose usage are under minimum buffer size. 3320 */ 3321 if (kind == SK_MEM_RECV) { 3322 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 3323 return 1; 3324 3325 } else { /* SK_MEM_SEND */ 3326 int wmem0 = sk_get_wmem0(sk, prot); 3327 3328 if (sk->sk_type == SOCK_STREAM) { 3329 if (sk->sk_wmem_queued < wmem0) 3330 return 1; 3331 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 3332 return 1; 3333 } 3334 } 3335 3336 if (sk_has_memory_pressure(sk)) { 3337 u64 alloc; 3338 3339 /* The following 'average' heuristic is within the 3340 * scope of global accounting, so it only makes 3341 * sense for global memory pressure. 3342 */ 3343 if (!sk_under_global_memory_pressure(sk)) 3344 return 1; 3345 3346 /* Try to be fair among all the sockets under global 3347 * pressure by allowing the ones that below average 3348 * usage to raise. 
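 *
 * Concretely, the check below allows the raise while
 *
 *	hard_limit (in pages) > nr_sockets * pages_used_by_this_sock
 *
 * e.g. with a hard limit of 4096 pages and 1024 sockets currently
 * allocated, a socket using fewer than 4 pages of rmem +
 * wmem_queued + forward_alloc may still grow.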
3349 */ 3350 alloc = sk_sockets_allocated_read_positive(sk); 3351 if (sk_prot_mem_limits(sk, 2) > alloc * 3352 sk_mem_pages(sk->sk_wmem_queued + 3353 atomic_read(&sk->sk_rmem_alloc) + 3354 sk->sk_forward_alloc)) 3355 return 1; 3356 } 3357 3358 suppress_allocation: 3359 3360 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 3361 sk_stream_moderate_sndbuf(sk); 3362 3363 /* Fail only if socket is _under_ its sndbuf. 3364 * In this case we cannot block, so that we have to fail. 3365 */ 3366 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) { 3367 /* Force charge with __GFP_NOFAIL */ 3368 if (memcg_enabled && !charged) 3369 mem_cgroup_sk_charge(sk, amt, 3370 gfp_memcg_charge() | __GFP_NOFAIL); 3371 return 1; 3372 } 3373 } 3374 3375 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 3376 3377 if (allocated) 3378 sk_memory_allocated_sub(sk, amt); 3379 3380 if (charged) 3381 mem_cgroup_sk_uncharge(sk, amt); 3382 3383 return 0; 3384 } 3385 3386 /** 3387 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 3388 * @sk: socket 3389 * @size: memory size to allocate 3390 * @kind: allocation type 3391 * 3392 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 3393 * rmem allocation. This function assumes that protocols which have 3394 * memory_pressure use sk_wmem_queued as write buffer accounting. 3395 */ 3396 int __sk_mem_schedule(struct sock *sk, int size, int kind) 3397 { 3398 int ret, amt = sk_mem_pages(size); 3399 3400 sk_forward_alloc_add(sk, amt << PAGE_SHIFT); 3401 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 3402 if (!ret) 3403 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT)); 3404 return ret; 3405 } 3406 EXPORT_SYMBOL(__sk_mem_schedule); 3407 3408 /** 3409 * __sk_mem_reduce_allocated - reclaim memory_allocated 3410 * @sk: socket 3411 * @amount: number of quanta 3412 * 3413 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 3414 */ 3415 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 3416 { 3417 if (mem_cgroup_sk_enabled(sk)) 3418 mem_cgroup_sk_uncharge(sk, amount); 3419 3420 if (sk->sk_bypass_prot_mem) 3421 return; 3422 3423 sk_memory_allocated_sub(sk, amount); 3424 3425 if (sk_under_global_memory_pressure(sk) && 3426 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 3427 sk_leave_memory_pressure(sk); 3428 } 3429 3430 /** 3431 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 3432 * @sk: socket 3433 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple) 3434 */ 3435 void __sk_mem_reclaim(struct sock *sk, int amount) 3436 { 3437 amount >>= PAGE_SHIFT; 3438 sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT)); 3439 __sk_mem_reduce_allocated(sk, amount); 3440 } 3441 EXPORT_SYMBOL(__sk_mem_reclaim); 3442 3443 int sk_set_peek_off(struct sock *sk, int val) 3444 { 3445 WRITE_ONCE(sk->sk_peek_off, val); 3446 return 0; 3447 } 3448 EXPORT_SYMBOL_GPL(sk_set_peek_off); 3449 3450 /* 3451 * Set of default routines for initialising struct proto_ops when 3452 * the protocol does not support a particular function. In certain 3453 * cases where it makes no sense for a protocol to have a "do nothing" 3454 * function, some default processing is provided. 
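 *
 * A protocol wires the stubs into its proto_ops table, e.g.
 * (sketch with hypothetical names):
 *
 *	static const struct proto_ops foo_proto_ops = {
 *		.family		= PF_FOO,
 *		.listen		= sock_no_listen,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *	};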
3455 */ 3456 3457 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 3458 { 3459 return -EOPNOTSUPP; 3460 } 3461 EXPORT_SYMBOL(sock_no_bind); 3462 3463 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 3464 int len, int flags) 3465 { 3466 return -EOPNOTSUPP; 3467 } 3468 EXPORT_SYMBOL(sock_no_connect); 3469 3470 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 3471 { 3472 return -EOPNOTSUPP; 3473 } 3474 EXPORT_SYMBOL(sock_no_socketpair); 3475 3476 int sock_no_accept(struct socket *sock, struct socket *newsock, 3477 struct proto_accept_arg *arg) 3478 { 3479 return -EOPNOTSUPP; 3480 } 3481 EXPORT_SYMBOL(sock_no_accept); 3482 3483 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 3484 int peer) 3485 { 3486 return -EOPNOTSUPP; 3487 } 3488 EXPORT_SYMBOL(sock_no_getname); 3489 3490 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 3491 { 3492 return -EOPNOTSUPP; 3493 } 3494 EXPORT_SYMBOL(sock_no_ioctl); 3495 3496 int sock_no_listen(struct socket *sock, int backlog) 3497 { 3498 return -EOPNOTSUPP; 3499 } 3500 EXPORT_SYMBOL(sock_no_listen); 3501 3502 int sock_no_shutdown(struct socket *sock, int how) 3503 { 3504 return -EOPNOTSUPP; 3505 } 3506 EXPORT_SYMBOL(sock_no_shutdown); 3507 3508 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 3509 { 3510 return -EOPNOTSUPP; 3511 } 3512 EXPORT_SYMBOL(sock_no_sendmsg); 3513 3514 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 3515 { 3516 return -EOPNOTSUPP; 3517 } 3518 EXPORT_SYMBOL(sock_no_sendmsg_locked); 3519 3520 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 3521 int flags) 3522 { 3523 return -EOPNOTSUPP; 3524 } 3525 EXPORT_SYMBOL(sock_no_recvmsg); 3526 3527 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 3528 { 3529 /* Mirror missing mmap method error code */ 3530 return -ENODEV; 3531 } 3532 EXPORT_SYMBOL(sock_no_mmap); 3533 3534 /* 3535 * When a file is received (via SCM_RIGHTS, etc), we must bump the 3536 * various sock-based usage counts. 
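 *
 * Called when a received file is installed (e.g. via SCM_RIGHTS), so
 * that a passed socket picks up the receiving task's cgroup classid
 * and netprio configuration (see the two sock_update_*() calls below).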
3537 */ 3538 void __receive_sock(struct file *file) 3539 { 3540 struct socket *sock; 3541 3542 sock = sock_from_file(file); 3543 if (sock) { 3544 sock_update_netprioidx(&sock->sk->sk_cgrp_data); 3545 sock_update_classid(&sock->sk->sk_cgrp_data); 3546 } 3547 } 3548 3549 /* 3550 * Default Socket Callbacks 3551 */ 3552 3553 static void sock_def_wakeup(struct sock *sk) 3554 { 3555 struct socket_wq *wq; 3556 3557 rcu_read_lock(); 3558 wq = rcu_dereference(sk->sk_wq); 3559 if (skwq_has_sleeper(wq)) 3560 wake_up_interruptible_all(&wq->wait); 3561 rcu_read_unlock(); 3562 } 3563 3564 static void sock_def_error_report(struct sock *sk) 3565 { 3566 struct socket_wq *wq; 3567 3568 rcu_read_lock(); 3569 wq = rcu_dereference(sk->sk_wq); 3570 if (skwq_has_sleeper(wq)) 3571 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 3572 sk_wake_async_rcu(sk, SOCK_WAKE_IO, POLL_ERR); 3573 rcu_read_unlock(); 3574 } 3575 3576 void sock_def_readable(struct sock *sk) 3577 { 3578 struct socket_wq *wq; 3579 3580 trace_sk_data_ready(sk); 3581 3582 rcu_read_lock(); 3583 wq = rcu_dereference(sk->sk_wq); 3584 if (skwq_has_sleeper(wq)) 3585 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 3586 EPOLLRDNORM | EPOLLRDBAND); 3587 sk_wake_async_rcu(sk, SOCK_WAKE_WAITD, POLL_IN); 3588 rcu_read_unlock(); 3589 } 3590 3591 static void sock_def_write_space(struct sock *sk) 3592 { 3593 struct socket_wq *wq; 3594 3595 rcu_read_lock(); 3596 3597 /* Do not wake up a writer until he can make "significant" 3598 * progress. --DaveM 3599 */ 3600 if (sock_writeable(sk)) { 3601 wq = rcu_dereference(sk->sk_wq); 3602 if (skwq_has_sleeper(wq)) 3603 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3604 EPOLLWRNORM | EPOLLWRBAND); 3605 3606 /* Should agree with poll, otherwise some programs break */ 3607 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3608 } 3609 3610 rcu_read_unlock(); 3611 } 3612 3613 /* An optimised version of sock_def_write_space(), should only be called 3614 * for SOCK_RCU_FREE sockets under RCU read section and after putting 3615 * ->sk_wmem_alloc. 3616 */ 3617 static void sock_def_write_space_wfree(struct sock *sk, int wmem_alloc) 3618 { 3619 /* Do not wake up a writer until he can make "significant" 3620 * progress. 
--DaveM 3621 */ 3622 if (__sock_writeable(sk, wmem_alloc)) { 3623 struct socket_wq *wq = rcu_dereference(sk->sk_wq); 3624 3625 /* rely on refcount_sub from sock_wfree() */ 3626 smp_mb__after_atomic(); 3627 if (wq && waitqueue_active(&wq->wait)) 3628 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 3629 EPOLLWRNORM | EPOLLWRBAND); 3630 3631 /* Should agree with poll, otherwise some programs break */ 3632 sk_wake_async_rcu(sk, SOCK_WAKE_SPACE, POLL_OUT); 3633 } 3634 } 3635 3636 static void sock_def_destruct(struct sock *sk) 3637 { 3638 } 3639 3640 void sk_send_sigurg(struct sock *sk) 3641 { 3642 if (sk->sk_socket && sk->sk_socket->file) 3643 if (send_sigurg(sk->sk_socket->file)) 3644 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 3645 } 3646 EXPORT_SYMBOL(sk_send_sigurg); 3647 3648 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 3649 unsigned long expires) 3650 { 3651 if (!mod_timer(timer, expires)) 3652 sock_hold(sk); 3653 } 3654 EXPORT_SYMBOL(sk_reset_timer); 3655 3656 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 3657 { 3658 if (timer_delete(timer)) 3659 __sock_put(sk); 3660 } 3661 EXPORT_SYMBOL(sk_stop_timer); 3662 3663 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer) 3664 { 3665 if (timer_delete_sync(timer)) 3666 __sock_put(sk); 3667 } 3668 EXPORT_SYMBOL(sk_stop_timer_sync); 3669 3670 void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) 3671 { 3672 sk_init_common(sk); 3673 sk->sk_send_head = NULL; 3674 3675 timer_setup(&sk->sk_timer, NULL, 0); 3676 3677 sk->sk_allocation = GFP_KERNEL; 3678 sk->sk_rcvbuf = READ_ONCE(sysctl_rmem_default); 3679 sk->sk_sndbuf = READ_ONCE(sysctl_wmem_default); 3680 sk->sk_state = TCP_CLOSE; 3681 sk->sk_use_task_frag = true; 3682 sk_set_socket(sk, sock); 3683 3684 sock_set_flag(sk, SOCK_ZAPPED); 3685 3686 if (sock) { 3687 sk->sk_type = sock->type; 3688 RCU_INIT_POINTER(sk->sk_wq, &sock->wq); 3689 sock->sk = sk; 3690 } else { 3691 RCU_INIT_POINTER(sk->sk_wq, NULL); 3692 } 3693 sk->sk_uid = uid; 3694 3695 sk->sk_state_change = sock_def_wakeup; 3696 sk->sk_data_ready = sock_def_readable; 3697 sk->sk_write_space = sock_def_write_space; 3698 sk->sk_error_report = sock_def_error_report; 3699 sk->sk_destruct = sock_def_destruct; 3700 3701 sk->sk_frag.page = NULL; 3702 sk->sk_frag.offset = 0; 3703 sk->sk_peek_off = -1; 3704 3705 sk->sk_peer_pid = NULL; 3706 sk->sk_peer_cred = NULL; 3707 spin_lock_init(&sk->sk_peer_lock); 3708 3709 sk->sk_write_pending = 0; 3710 sk->sk_rcvlowat = 1; 3711 sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; 3712 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; 3713 3714 sk->sk_stamp = SK_DEFAULT_STAMP; 3715 #if BITS_PER_LONG==32 3716 seqlock_init(&sk->sk_stamp_seq); 3717 #endif 3718 atomic_set(&sk->sk_zckey, 0); 3719 3720 #ifdef CONFIG_NET_RX_BUSY_POLL 3721 sk->sk_napi_id = 0; 3722 sk->sk_ll_usec = READ_ONCE(sysctl_net_busy_read); 3723 #endif 3724 3725 sk->sk_max_pacing_rate = ~0UL; 3726 sk->sk_pacing_rate = ~0UL; 3727 WRITE_ONCE(sk->sk_pacing_shift, 10); 3728 sk->sk_incoming_cpu = -1; 3729 3730 sk_rx_queue_clear(sk); 3731 /* 3732 * Before updating sk_refcnt, we must commit prior changes to memory 3733 * (Documentation/RCU/rculist_nulls.rst for details) 3734 */ 3735 smp_wmb(); 3736 refcount_set(&sk->sk_refcnt, 1); 3737 sk_drops_reset(sk); 3738 } 3739 EXPORT_SYMBOL(sock_init_data_uid); 3740 3741 void sock_init_data(struct socket *sock, struct sock *sk) 3742 { 3743 kuid_t uid = sock ? 
3744 SOCK_INODE(sock)->i_uid : 3745 make_kuid(sock_net(sk)->user_ns, 0); 3746 3747 sock_init_data_uid(sock, sk, uid); 3748 } 3749 EXPORT_SYMBOL(sock_init_data); 3750 3751 void lock_sock_nested(struct sock *sk, int subclass) 3752 { 3753 /* The sk_lock has mutex_lock() semantics here. */ 3754 mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_); 3755 3756 might_sleep(); 3757 spin_lock_bh(&sk->sk_lock.slock); 3758 if (sock_owned_by_user_nocheck(sk)) 3759 __lock_sock(sk); 3760 sk->sk_lock.owned = 1; 3761 spin_unlock_bh(&sk->sk_lock.slock); 3762 } 3763 EXPORT_SYMBOL(lock_sock_nested); 3764 3765 void release_sock(struct sock *sk) 3766 { 3767 spin_lock_bh(&sk->sk_lock.slock); 3768 if (sk->sk_backlog.tail) 3769 __release_sock(sk); 3770 3771 if (sk->sk_prot->release_cb) 3772 INDIRECT_CALL_INET_1(sk->sk_prot->release_cb, 3773 tcp_release_cb, sk); 3774 3775 sock_release_ownership(sk); 3776 if (waitqueue_active(&sk->sk_lock.wq)) 3777 wake_up(&sk->sk_lock.wq); 3778 spin_unlock_bh(&sk->sk_lock.slock); 3779 } 3780 EXPORT_SYMBOL(release_sock); 3781 3782 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock) 3783 { 3784 might_sleep(); 3785 spin_lock_bh(&sk->sk_lock.slock); 3786 3787 if (!sock_owned_by_user_nocheck(sk)) { 3788 /* 3789 * Fast path return with bottom halves disabled and 3790 * sock::sk_lock.slock held. 3791 * 3792 * The 'mutex' is not contended and holding 3793 * sock::sk_lock.slock prevents all other lockers to 3794 * proceed so the corresponding unlock_sock_fast() can 3795 * avoid the slow path of release_sock() completely and 3796 * just release slock. 3797 * 3798 * From a semantical POV this is equivalent to 'acquiring' 3799 * the 'mutex', hence the corresponding lockdep 3800 * mutex_release() has to happen in the fast path of 3801 * unlock_sock_fast(). 
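		 *
		 * Callers normally use the wrapper pair, e.g. (sketch):
		 *
		 *	bool slow = lock_sock_fast(sk);
		 *
		 *	touch protected socket state, then
		 *	unlock_sock_fast(sk, slow);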
3802 */ 3803 return false; 3804 } 3805 3806 __lock_sock(sk); 3807 sk->sk_lock.owned = 1; 3808 __acquire(&sk->sk_lock.slock); 3809 spin_unlock_bh(&sk->sk_lock.slock); 3810 return true; 3811 } 3812 EXPORT_SYMBOL(__lock_sock_fast); 3813 3814 int sock_gettstamp(struct socket *sock, void __user *userstamp, 3815 bool timeval, bool time32) 3816 { 3817 struct sock *sk = sock->sk; 3818 struct timespec64 ts; 3819 3820 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 3821 ts = ktime_to_timespec64(sock_read_timestamp(sk)); 3822 if (ts.tv_sec == -1) 3823 return -ENOENT; 3824 if (ts.tv_sec == 0) { 3825 ktime_t kt = ktime_get_real(); 3826 sock_write_timestamp(sk, kt); 3827 ts = ktime_to_timespec64(kt); 3828 } 3829 3830 if (timeval) 3831 ts.tv_nsec /= 1000; 3832 3833 #ifdef CONFIG_COMPAT_32BIT_TIME 3834 if (time32) 3835 return put_old_timespec32(&ts, userstamp); 3836 #endif 3837 #ifdef CONFIG_SPARC64 3838 /* beware of padding in sparc64 timeval */ 3839 if (timeval && !in_compat_syscall()) { 3840 struct __kernel_old_timeval __user tv = { 3841 .tv_sec = ts.tv_sec, 3842 .tv_usec = ts.tv_nsec, 3843 }; 3844 if (copy_to_user(userstamp, &tv, sizeof(tv))) 3845 return -EFAULT; 3846 return 0; 3847 } 3848 #endif 3849 return put_timespec64(&ts, userstamp); 3850 } 3851 EXPORT_SYMBOL(sock_gettstamp); 3852 3853 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag) 3854 { 3855 if (!sock_flag(sk, flag)) { 3856 unsigned long previous_flags = sk->sk_flags; 3857 3858 sock_set_flag(sk, flag); 3859 /* 3860 * we just set one of the two flags which require net 3861 * time stamping, but time stamping might have been on 3862 * already because of the other one 3863 */ 3864 if (sock_needs_netstamp(sk) && 3865 !(previous_flags & SK_FLAGS_TIMESTAMP)) 3866 net_enable_timestamp(); 3867 } 3868 } 3869 3870 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, 3871 int level, int type) 3872 { 3873 struct sock_exterr_skb *serr; 3874 struct sk_buff *skb; 3875 int copied, err; 3876 3877 err = -EAGAIN; 3878 skb = sock_dequeue_err_skb(sk); 3879 if (skb == NULL) 3880 goto out; 3881 3882 copied = skb->len; 3883 if (copied > len) { 3884 msg->msg_flags |= MSG_TRUNC; 3885 copied = len; 3886 } 3887 err = skb_copy_datagram_msg(skb, 0, msg, copied); 3888 if (err) 3889 goto out_free_skb; 3890 3891 sock_recv_timestamp(msg, sk, skb); 3892 3893 serr = SKB_EXT_ERR(skb); 3894 put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee); 3895 3896 msg->msg_flags |= MSG_ERRQUEUE; 3897 err = copied; 3898 3899 out_free_skb: 3900 kfree_skb(skb); 3901 out: 3902 return err; 3903 } 3904 EXPORT_SYMBOL(sock_recv_errqueue); 3905 3906 /* 3907 * Get a socket option on an socket. 3908 * 3909 * FIX: POSIX 1003.1g is very ambiguous here. It states that 3910 * asynchronous errors should be reported by getsockopt. We assume 3911 * this means if you specify SO_ERROR (otherwise what is the point of it). 3912 */ 3913 int sock_common_getsockopt(struct socket *sock, int level, int optname, 3914 char __user *optval, int __user *optlen) 3915 { 3916 struct sock *sk = sock->sk; 3917 3918 /* IPV6_ADDRFORM can change sk->sk_prot under us. 
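	 * The READ_ONCE() below pairs with the WRITE_ONCE() done on the
	 * IPV6_ADDRFORM conversion path when the socket is switched to
	 * its IPv4 proto.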
*/ 3919 return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); 3920 } 3921 EXPORT_SYMBOL(sock_common_getsockopt); 3922 3923 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, 3924 int flags) 3925 { 3926 struct sock *sk = sock->sk; 3927 int addr_len = 0; 3928 int err; 3929 3930 err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len); 3931 if (err >= 0) 3932 msg->msg_namelen = addr_len; 3933 return err; 3934 } 3935 EXPORT_SYMBOL(sock_common_recvmsg); 3936 3937 /* 3938 * Set socket options on an inet socket. 3939 */ 3940 int sock_common_setsockopt(struct socket *sock, int level, int optname, 3941 sockptr_t optval, unsigned int optlen) 3942 { 3943 struct sock *sk = sock->sk; 3944 3945 /* IPV6_ADDRFORM can change sk->sk_prot under us. */ 3946 return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); 3947 } 3948 EXPORT_SYMBOL(sock_common_setsockopt); 3949 3950 void sk_common_release(struct sock *sk) 3951 { 3952 if (sk->sk_prot->destroy) 3953 sk->sk_prot->destroy(sk); 3954 3955 /* 3956 * Observation: when sk_common_release is called, processes have 3957 * no access to socket. But net still has. 3958 * Step one, detach it from networking: 3959 * 3960 * A. Remove from hash tables. 3961 */ 3962 3963 sk->sk_prot->unhash(sk); 3964 3965 /* 3966 * In this point socket cannot receive new packets, but it is possible 3967 * that some packets are in flight because some CPU runs receiver and 3968 * did hash table lookup before we unhashed socket. They will achieve 3969 * receive queue and will be purged by socket destructor. 3970 * 3971 * Also we still have packets pending on receive queue and probably, 3972 * our own packets waiting in device queues. sock_destroy will drain 3973 * receive queue, but transmitted packets will delay socket destruction 3974 * until the last reference will be released. 3975 */ 3976 3977 sock_orphan(sk); 3978 3979 xfrm_sk_free_policy(sk); 3980 3981 sock_put(sk); 3982 } 3983 EXPORT_SYMBOL(sk_common_release); 3984 3985 void sk_get_meminfo(const struct sock *sk, u32 *mem) 3986 { 3987 memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); 3988 3989 mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); 3990 mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf); 3991 mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); 3992 mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf); 3993 mem[SK_MEMINFO_FWD_ALLOC] = READ_ONCE(sk->sk_forward_alloc); 3994 mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued); 3995 mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); 3996 mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len); 3997 mem[SK_MEMINFO_DROPS] = sk_drops_read(sk); 3998 } 3999 4000 #ifdef CONFIG_PROC_FS 4001 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 4002 4003 int sock_prot_inuse_get(struct net *net, struct proto *prot) 4004 { 4005 int cpu, idx = prot->inuse_idx; 4006 int res = 0; 4007 4008 for_each_possible_cpu(cpu) 4009 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; 4010 4011 return res >= 0 ? 
res : 0; 4012 } 4013 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 4014 4015 int sock_inuse_get(struct net *net) 4016 { 4017 int cpu, res = 0; 4018 4019 for_each_possible_cpu(cpu) 4020 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all; 4021 4022 return res; 4023 } 4024 4025 EXPORT_SYMBOL_GPL(sock_inuse_get); 4026 4027 static int __net_init sock_inuse_init_net(struct net *net) 4028 { 4029 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 4030 if (net->core.prot_inuse == NULL) 4031 return -ENOMEM; 4032 return 0; 4033 } 4034 4035 static void __net_exit sock_inuse_exit_net(struct net *net) 4036 { 4037 free_percpu(net->core.prot_inuse); 4038 } 4039 4040 static struct pernet_operations net_inuse_ops = { 4041 .init = sock_inuse_init_net, 4042 .exit = sock_inuse_exit_net, 4043 }; 4044 4045 static __init int net_inuse_init(void) 4046 { 4047 if (register_pernet_subsys(&net_inuse_ops)) 4048 panic("Cannot initialize net inuse counters"); 4049 4050 return 0; 4051 } 4052 4053 core_initcall(net_inuse_init); 4054 4055 static int assign_proto_idx(struct proto *prot) 4056 { 4057 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 4058 4059 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR)) { 4060 pr_err("PROTO_INUSE_NR exhausted\n"); 4061 return -ENOSPC; 4062 } 4063 4064 set_bit(prot->inuse_idx, proto_inuse_idx); 4065 return 0; 4066 } 4067 4068 static void release_proto_idx(struct proto *prot) 4069 { 4070 if (prot->inuse_idx != PROTO_INUSE_NR) 4071 clear_bit(prot->inuse_idx, proto_inuse_idx); 4072 } 4073 #else 4074 static inline int assign_proto_idx(struct proto *prot) 4075 { 4076 return 0; 4077 } 4078 4079 static inline void release_proto_idx(struct proto *prot) 4080 { 4081 } 4082 4083 #endif 4084 4085 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot) 4086 { 4087 if (!twsk_prot) 4088 return; 4089 kfree(twsk_prot->twsk_slab_name); 4090 twsk_prot->twsk_slab_name = NULL; 4091 kmem_cache_destroy(twsk_prot->twsk_slab); 4092 twsk_prot->twsk_slab = NULL; 4093 } 4094 4095 static int tw_prot_init(const struct proto *prot) 4096 { 4097 struct timewait_sock_ops *twsk_prot = prot->twsk_prot; 4098 4099 if (!twsk_prot) 4100 return 0; 4101 4102 twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", 4103 prot->name); 4104 if (!twsk_prot->twsk_slab_name) 4105 return -ENOMEM; 4106 4107 twsk_prot->twsk_slab = 4108 kmem_cache_create(twsk_prot->twsk_slab_name, 4109 twsk_prot->twsk_obj_size, 0, 4110 SLAB_ACCOUNT | prot->slab_flags, 4111 NULL); 4112 if (!twsk_prot->twsk_slab) { 4113 pr_crit("%s: Can't create timewait sock SLAB cache!\n", 4114 prot->name); 4115 return -ENOMEM; 4116 } 4117 4118 return 0; 4119 } 4120 4121 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 4122 { 4123 if (!rsk_prot) 4124 return; 4125 kfree(rsk_prot->slab_name); 4126 rsk_prot->slab_name = NULL; 4127 kmem_cache_destroy(rsk_prot->slab); 4128 rsk_prot->slab = NULL; 4129 } 4130 4131 static int req_prot_init(const struct proto *prot) 4132 { 4133 struct request_sock_ops *rsk_prot = prot->rsk_prot; 4134 4135 if (!rsk_prot) 4136 return 0; 4137 4138 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 4139 prot->name); 4140 if (!rsk_prot->slab_name) 4141 return -ENOMEM; 4142 4143 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 4144 rsk_prot->obj_size, 0, 4145 SLAB_ACCOUNT | prot->slab_flags, 4146 NULL); 4147 4148 if (!rsk_prot->slab) { 4149 pr_crit("%s: Can't create request sock SLAB cache!\n", 4150 prot->name); 4151 return -ENOMEM; 4152 } 4153 return 0; 4154 } 4155 4156 int 
int sock_load_diag_module(int family, int protocol)
{
	if (!protocol) {
		if (!sock_is_registered(family))
			return -ENOENT;

		return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
				      NETLINK_SOCK_DIAG, family);
	}

#ifdef CONFIG_INET
	if (family == AF_INET &&
	    protocol != IPPROTO_RAW &&
	    protocol < MAX_INET_PROTOS &&
	    !rcu_access_pointer(inet_protos[protocol]))
		return -ENOENT;
#endif

	return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
			      NETLINK_SOCK_DIAG, family, protocol);
}
EXPORT_SYMBOL(sock_load_diag_module);
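/*
 * Illustrative sketch (compiled out): a caller asking for the TCP diag
 * handler. With PF_NETLINK == 16, NETLINK_SOCK_DIAG == 4, AF_INET == 2 and
 * IPPROTO_TCP == 6, the request_module() call above resolves to the alias
 * "net-pf-16-proto-4-type-2-6", which the matching diag module is expected
 * to advertise so modprobe can find it.
 */
#if 0
static int example_load_tcp_diag(void)
{
	return sock_load_diag_module(AF_INET, IPPROTO_TCP);
}
#endif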
"yes" : "no" : "NI"; 4284 } 4285 4286 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 4287 { 4288 4289 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 4290 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 4291 proto->name, 4292 proto->obj_size, 4293 sock_prot_inuse_get(seq_file_net(seq), proto), 4294 sock_prot_memory_allocated(proto), 4295 sock_prot_memory_pressure(proto), 4296 proto->max_header, 4297 proto->slab == NULL ? "no" : "yes", 4298 module_name(proto->owner), 4299 proto_method_implemented(proto->close), 4300 proto_method_implemented(proto->connect), 4301 proto_method_implemented(proto->disconnect), 4302 proto_method_implemented(proto->accept), 4303 proto_method_implemented(proto->ioctl), 4304 proto_method_implemented(proto->init), 4305 proto_method_implemented(proto->destroy), 4306 proto_method_implemented(proto->shutdown), 4307 proto_method_implemented(proto->setsockopt), 4308 proto_method_implemented(proto->getsockopt), 4309 proto_method_implemented(proto->sendmsg), 4310 proto_method_implemented(proto->recvmsg), 4311 proto_method_implemented(proto->bind), 4312 proto_method_implemented(proto->backlog_rcv), 4313 proto_method_implemented(proto->hash), 4314 proto_method_implemented(proto->unhash), 4315 proto_method_implemented(proto->get_port), 4316 proto_method_implemented(proto->enter_memory_pressure)); 4317 } 4318 4319 static int proto_seq_show(struct seq_file *seq, void *v) 4320 { 4321 if (v == &proto_list) 4322 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 4323 "protocol", 4324 "size", 4325 "sockets", 4326 "memory", 4327 "press", 4328 "maxhdr", 4329 "slab", 4330 "module", 4331 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n"); 4332 else 4333 proto_seq_printf(seq, list_entry(v, struct proto, node)); 4334 return 0; 4335 } 4336 4337 static const struct seq_operations proto_seq_ops = { 4338 .start = proto_seq_start, 4339 .next = proto_seq_next, 4340 .stop = proto_seq_stop, 4341 .show = proto_seq_show, 4342 }; 4343 4344 static __net_init int proto_init_net(struct net *net) 4345 { 4346 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 4347 sizeof(struct seq_net_private))) 4348 return -ENOMEM; 4349 4350 return 0; 4351 } 4352 4353 static __net_exit void proto_exit_net(struct net *net) 4354 { 4355 remove_proc_entry("protocols", net->proc_net); 4356 } 4357 4358 4359 static __net_initdata struct pernet_operations proto_net_ops = { 4360 .init = proto_init_net, 4361 .exit = proto_exit_net, 4362 }; 4363 4364 static int __init proto_init(void) 4365 { 4366 return register_pernet_subsys(&proto_net_ops); 4367 } 4368 4369 subsys_initcall(proto_init); 4370 4371 #endif /* PROC_FS */ 4372 4373 #ifdef CONFIG_NET_RX_BUSY_POLL 4374 bool sk_busy_loop_end(void *p, unsigned long start_time) 4375 { 4376 struct sock *sk = p; 4377 4378 if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) 4379 return true; 4380 4381 if (sk_is_udp(sk) && 4382 !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) 4383 return true; 4384 4385 return sk_busy_loop_timeout(sk, start_time); 4386 } 4387 EXPORT_SYMBOL(sk_busy_loop_end); 4388 #endif /* CONFIG_NET_RX_BUSY_POLL */ 4389 4390 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len) 4391 { 4392 if (!sk->sk_prot->bind_add) 4393 return -EOPNOTSUPP; 4394 return sk->sk_prot->bind_add(sk, addr, addr_len); 4395 } 4396 EXPORT_SYMBOL(sock_bind_add); 4397 4398 /* Copy 'size' bytes from userspace and return `size` back to userspace */ 4399 int 
/* Copy 'size' bytes of input from userspace, run the protocol ioctl on the
 * kernel copy, and copy 'size' bytes of result back to userspace.
 */
int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
		     void __user *arg, void *karg, size_t size)
{
	int ret;

	if (copy_from_user(karg, arg, size))
		return -EFAULT;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
	if (ret)
		return ret;

	if (copy_to_user(arg, karg, size))
		return -EFAULT;

	return 0;
}
EXPORT_SYMBOL(sock_ioctl_inout);

/* This is the most common ioctl prep function: a 4-byte result is copied back
 * to userspace if the ioctl() returns successfully, and no input argument is
 * copied from userspace.
 */
static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int ret, karg = 0;

	ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
	if (ret)
		return ret;

	return put_user(karg, (int __user *)arg);
}

/* A wrapper around socket ioctls that copies data from userspace (depending
 * on the protocol/ioctl) and copies the result back to userspace. The main
 * motivation is to pass kernel memory, rather than userspace memory, to the
 * protocol ioctl callbacks.
 */
int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	int rc = 1;

	if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
		rc = ipmr_sk_ioctl(sk, cmd, arg);
	else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
		rc = ip6mr_sk_ioctl(sk, cmd, arg);
	else if (sk_is_phonet(sk))
		rc = phonet_sk_ioctl(sk, cmd, arg);

	/* If the ioctl was processed, return its result. */
	if (rc <= 0)
		return rc;

	/* Otherwise fall back to the default handler. */
	return sock_ioctl_out(sk, cmd, arg);
}
EXPORT_SYMBOL(sk_ioctl);
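/*
 * Illustrative sketch (compiled out): a protocol ->ioctl() callback as seen
 * from sk_ioctl()/sock_ioctl_out(), assuming the prototype takes a kernel
 * int pointer as the wrappers above imply. The handler works purely on the
 * kernel buffer; copying to and from userspace is done by the wrappers. The
 * command mapping below is hypothetical.
 */
#if 0
static int example_proto_ioctl(struct sock *sk, int cmd, int *karg)
{
	switch (cmd) {
	case SIOCINQ:
		/* Report bytes queued for reading; no copy_to_user() here. */
		*karg = sk_rmem_alloc_get(sk);
		return 0;
	default:
		return -ENOIOCTLCMD;
	}
}
#endif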
static int __init sock_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_drops);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_peek_off);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_error_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_receive_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rx, sk_backlog);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_ifindex);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rx_dst_cookie);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_filter);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_wq);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_data_ready);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rx, sk_rcvlowat);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_err);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_socket);
#ifdef CONFIG_MEMCG
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_rxtx, sk_memcg);
#endif

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_lock);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_reserved_mem);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_forward_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_rxtx, sk_tsflags);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_omem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_err_soft);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_wmem_alloc);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tsq_flags);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_send_head);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_queue);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_write_pending);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_dst_pending_confirm);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_status);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_frag);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_timer);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_zckey);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_write_tx, sk_tskey);

	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_max_pacing_rate);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndtimeo);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_priority);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_mark);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_uid);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_protocol);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_dst_cache);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_route_caps);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_type);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_size);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_allocation);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_txhash);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_sndbuf);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_gso_max_segs);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_pacing_shift);
	CACHELINE_ASSERT_GROUP_MEMBER(struct sock, sock_read_tx, sk_use_task_frag);
	return 0;
}

core_initcall(sock_struct_check);
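/*
 * Illustrative sketch (compiled out): the assertions above pair with
 * __cacheline_group_begin()/__cacheline_group_end() markers placed inside
 * struct sock itself. A hypothetical struct using the same pattern is shown
 * below; CACHELINE_ASSERT_GROUP_MEMBER() then breaks the build if a member
 * is moved out of the group it is expected to share cache lines with.
 */
#if 0
struct example_hot_state {
	__cacheline_group_begin(example_write_tx);
	unsigned long	bytes_queued;
	unsigned long	bytes_sent;
	__cacheline_group_end(example_write_tx);
};

static int __init example_struct_check(void)
{
	CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_state, example_write_tx,
				      bytes_queued);
	CACHELINE_ASSERT_GROUP_MEMBER(struct example_hot_state, example_write_tx,
				      bytes_sent);
	return 0;
}

core_initcall(example_struct_check);
#endif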