/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <asm/unaligned.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <linux/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#include <net/tcp.h>
#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

static void sock_inuse_add(struct net *net, int val);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in the
 * user namespace @user_ns when the socket was created, and whether the
 * current process has that capability as well.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap in all
 * user namespaces when the socket was created, and whether the current
 * process has that capability as well.
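 *
 * (This is equivalent to sk_ns_capable(sk, &init_user_ns, cap).)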
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap over
 * the network namespace the socket is a member of when the socket was
 * created, and whether the current process has that capability as well.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family and separate keys for internal and
 * userspace sockets.
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_kern_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
static struct lock_class_key af_family_kern_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */

#define _sock_locks(x)						  \
  x "AF_UNSPEC",	x "AF_UNIX"     ,	x "AF_INET"     , \
  x "AF_AX25"  ,	x "AF_IPX"      ,	x "AF_APPLETALK", \
  x "AF_NETROM",	x "AF_BRIDGE"   ,	x "AF_ATMPVC"   , \
  x "AF_X25"   ,	x "AF_INET6"    ,	x "AF_ROSE"     , \
  x "AF_DECnet",	x "AF_NETBEUI"  ,	x "AF_SECURITY" , \
  x "AF_KEY"   ,	x "AF_NETLINK"  ,	x "AF_PACKET"   , \
  x "AF_ASH"   ,	x "AF_ECONET"   ,	x "AF_ATMSVC"   , \
  x "AF_RDS"   ,	x "AF_SNA"      ,	x "AF_IRDA"     , \
  x "AF_PPPOX" ,	x "AF_WANPIPE"  ,	x "AF_LLC"      , \
  x "27"       ,	x "28"          ,	x "AF_CAN"      , \
  x "AF_TIPC"  ,	x "AF_BLUETOOTH",	x "IUCV"        , \
  x "AF_RXRPC" ,	x "AF_ISDN"     ,	x "AF_PHONET"   , \
  x "AF_IEEE802154",	x "AF_CAIF"     ,	x "AF_ALG"      , \
  x "AF_NFC"   ,	x "AF_VSOCK"    ,	x "AF_KCM"      , \
  x "AF_QIPCRTR",	x "AF_SMC"      ,	x "AF_XDP"      , \
  x "AF_MAX"

static const char *const af_family_key_strings[AF_MAX+1] = {
	_sock_locks("sk_lock-")
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
	_sock_locks("slock-")
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
	_sock_locks("clock-")
};

static const char *const af_family_kern_key_strings[AF_MAX+1] = {
	_sock_locks("k-sk_lock-")
};
static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
	_sock_locks("k-slock-")
};
static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
	_sock_locks("k-clock-")
};
static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
	_sock_locks("rlock-")
};
static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
	_sock_locks("wlock-")
};
static const char *const af_family_elock_key_strings[AF_MAX+1] = {
	_sock_locks("elock-")
};

/*
 * sk_callback_lock and sk queues locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];
static struct lock_class_key af_rlock_keys[AF_MAX];
static struct lock_class_key af_wlock_keys[AF_MAX];
static struct lock_class_key af_elock_keys[AF_MAX];
static struct lock_class_key af_kern_callback_keys[AF_MAX];

/* Run time adjustable parameters.
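 * These back the net.core.{wmem,rmem}_{max,default} sysctls.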
 */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

int sysctl_tstamp_allow_data __read_mostly = 1;

DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
EXPORT_SYMBOL_GPL(memalloc_socks_key);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_branch_inc(&memalloc_socks_key);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_branch_dec(&memalloc_socks_key);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned int noreclaim_flag;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	noreclaim_flag = memalloc_noreclaim_save();
	ret = sk->sk_backlog_rcv(sk, skb);
	memalloc_noreclaim_restore(noreclaim_flag);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we dont leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(__sock_queue_rcv_skb);

int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;

	err = sk_filter(sk, skb);
	if (err)
		return err;

	return __sock_queue_rcv_skb(sk, skb);
}
EXPORT_SYMBOL(sock_queue_rcv_skb);

int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
		     const int nested, unsigned int trim_cap, bool refcounted)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter_trim_cap(sk, skb, trim_cap))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	if (refcounted)
		sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(__sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		sk->sk_dst_pending_confirm = 0;
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);

	/* Sorry...
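	 *
	 * Binding a socket to a device requires CAP_NET_RAW in the user
	 * namespace that owns the socket's network namespace (checked
	 * right below).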
*/ 530 ret = -EPERM; 531 if (!ns_capable(net->user_ns, CAP_NET_RAW)) 532 goto out; 533 534 ret = -EINVAL; 535 if (ifindex < 0) 536 goto out; 537 538 sk->sk_bound_dev_if = ifindex; 539 if (sk->sk_prot->rehash) 540 sk->sk_prot->rehash(sk); 541 sk_dst_reset(sk); 542 543 ret = 0; 544 545 out: 546 #endif 547 548 return ret; 549 } 550 551 static int sock_setbindtodevice(struct sock *sk, char __user *optval, 552 int optlen) 553 { 554 int ret = -ENOPROTOOPT; 555 #ifdef CONFIG_NETDEVICES 556 struct net *net = sock_net(sk); 557 char devname[IFNAMSIZ]; 558 int index; 559 560 ret = -EINVAL; 561 if (optlen < 0) 562 goto out; 563 564 /* Bind this socket to a particular device like "eth0", 565 * as specified in the passed interface name. If the 566 * name is "" or the option length is zero the socket 567 * is not bound. 568 */ 569 if (optlen > IFNAMSIZ - 1) 570 optlen = IFNAMSIZ - 1; 571 memset(devname, 0, sizeof(devname)); 572 573 ret = -EFAULT; 574 if (copy_from_user(devname, optval, optlen)) 575 goto out; 576 577 index = 0; 578 if (devname[0] != '\0') { 579 struct net_device *dev; 580 581 rcu_read_lock(); 582 dev = dev_get_by_name_rcu(net, devname); 583 if (dev) 584 index = dev->ifindex; 585 rcu_read_unlock(); 586 ret = -ENODEV; 587 if (!dev) 588 goto out; 589 } 590 591 lock_sock(sk); 592 ret = sock_setbindtodevice_locked(sk, index); 593 release_sock(sk); 594 595 out: 596 #endif 597 598 return ret; 599 } 600 601 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 602 int __user *optlen, int len) 603 { 604 int ret = -ENOPROTOOPT; 605 #ifdef CONFIG_NETDEVICES 606 struct net *net = sock_net(sk); 607 char devname[IFNAMSIZ]; 608 609 if (sk->sk_bound_dev_if == 0) { 610 len = 0; 611 goto zero; 612 } 613 614 ret = -EINVAL; 615 if (len < IFNAMSIZ) 616 goto out; 617 618 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 619 if (ret) 620 goto out; 621 622 len = strlen(devname) + 1; 623 624 ret = -EFAULT; 625 if (copy_to_user(optval, devname, len)) 626 goto out; 627 628 zero: 629 ret = -EFAULT; 630 if (put_user(len, optlen)) 631 goto out; 632 633 ret = 0; 634 635 out: 636 #endif 637 638 return ret; 639 } 640 641 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 642 { 643 if (valbool) 644 sock_set_flag(sk, bit); 645 else 646 sock_reset_flag(sk, bit); 647 } 648 649 bool sk_mc_loop(struct sock *sk) 650 { 651 if (dev_recursion_level()) 652 return false; 653 if (!sk) 654 return true; 655 switch (sk->sk_family) { 656 case AF_INET: 657 return inet_sk(sk)->mc_loop; 658 #if IS_ENABLED(CONFIG_IPV6) 659 case AF_INET6: 660 return inet6_sk(sk)->mc_loop; 661 #endif 662 } 663 WARN_ON(1); 664 return true; 665 } 666 EXPORT_SYMBOL(sk_mc_loop); 667 668 /* 669 * This is meant for all protocols to use and covers goings on 670 * at the socket level. Everything here is generic. 671 */ 672 673 int sock_setsockopt(struct socket *sock, int level, int optname, 674 char __user *optval, unsigned int optlen) 675 { 676 struct sock_txtime sk_txtime; 677 struct sock *sk = sock->sk; 678 int val; 679 int valbool; 680 struct linger ling; 681 int ret = 0; 682 683 /* 684 * Options without arguments 685 */ 686 687 if (optname == SO_BINDTODEVICE) 688 return sock_setbindtodevice(sk, optval, optlen); 689 690 if (optlen < sizeof(int)) 691 return -EINVAL; 692 693 if (get_user(val, (int __user *)optval)) 694 return -EFAULT; 695 696 valbool = val ? 
1 : 0; 697 698 lock_sock(sk); 699 700 switch (optname) { 701 case SO_DEBUG: 702 if (val && !capable(CAP_NET_ADMIN)) 703 ret = -EACCES; 704 else 705 sock_valbool_flag(sk, SOCK_DBG, valbool); 706 break; 707 case SO_REUSEADDR: 708 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 709 break; 710 case SO_REUSEPORT: 711 sk->sk_reuseport = valbool; 712 break; 713 case SO_TYPE: 714 case SO_PROTOCOL: 715 case SO_DOMAIN: 716 case SO_ERROR: 717 ret = -ENOPROTOOPT; 718 break; 719 case SO_DONTROUTE: 720 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 721 sk_dst_reset(sk); 722 break; 723 case SO_BROADCAST: 724 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 725 break; 726 case SO_SNDBUF: 727 /* Don't error on this BSD doesn't and if you think 728 * about it this is right. Otherwise apps have to 729 * play 'guess the biggest size' games. RCVBUF/SNDBUF 730 * are treated in BSD as hints 731 */ 732 val = min_t(u32, val, sysctl_wmem_max); 733 set_sndbuf: 734 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 735 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF); 736 /* Wake up sending tasks if we upped the value. */ 737 sk->sk_write_space(sk); 738 break; 739 740 case SO_SNDBUFFORCE: 741 if (!capable(CAP_NET_ADMIN)) { 742 ret = -EPERM; 743 break; 744 } 745 goto set_sndbuf; 746 747 case SO_RCVBUF: 748 /* Don't error on this BSD doesn't and if you think 749 * about it this is right. Otherwise apps have to 750 * play 'guess the biggest size' games. RCVBUF/SNDBUF 751 * are treated in BSD as hints 752 */ 753 val = min_t(u32, val, sysctl_rmem_max); 754 set_rcvbuf: 755 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 756 /* 757 * We double it on the way in to account for 758 * "struct sk_buff" etc. overhead. Applications 759 * assume that the SO_RCVBUF setting they make will 760 * allow that much actual data to be received on that 761 * socket. 762 * 763 * Applications are unaware that "struct sk_buff" and 764 * other overheads allocate from the receive buffer 765 * during socket buffer allocation. 766 * 767 * And after considering the possible alternatives, 768 * returning the value we actually used in getsockopt 769 * is the most desirable behavior. 
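		 *
		 * Illustrative example (not taken from this file): a
		 * setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val))
		 * call with val == 65536 leaves sk_rcvbuf at 131072
		 * (assuming net.core.rmem_max allows at least 65536), and a
		 * later getsockopt(SO_RCVBUF) then reports 131072.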
770 */ 771 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF); 772 break; 773 774 case SO_RCVBUFFORCE: 775 if (!capable(CAP_NET_ADMIN)) { 776 ret = -EPERM; 777 break; 778 } 779 goto set_rcvbuf; 780 781 case SO_KEEPALIVE: 782 if (sk->sk_prot->keepalive) 783 sk->sk_prot->keepalive(sk, valbool); 784 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 785 break; 786 787 case SO_OOBINLINE: 788 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 789 break; 790 791 case SO_NO_CHECK: 792 sk->sk_no_check_tx = valbool; 793 break; 794 795 case SO_PRIORITY: 796 if ((val >= 0 && val <= 6) || 797 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 798 sk->sk_priority = val; 799 else 800 ret = -EPERM; 801 break; 802 803 case SO_LINGER: 804 if (optlen < sizeof(ling)) { 805 ret = -EINVAL; /* 1003.1g */ 806 break; 807 } 808 if (copy_from_user(&ling, optval, sizeof(ling))) { 809 ret = -EFAULT; 810 break; 811 } 812 if (!ling.l_onoff) 813 sock_reset_flag(sk, SOCK_LINGER); 814 else { 815 #if (BITS_PER_LONG == 32) 816 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 817 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 818 else 819 #endif 820 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 821 sock_set_flag(sk, SOCK_LINGER); 822 } 823 break; 824 825 case SO_BSDCOMPAT: 826 sock_warn_obsolete_bsdism("setsockopt"); 827 break; 828 829 case SO_PASSCRED: 830 if (valbool) 831 set_bit(SOCK_PASSCRED, &sock->flags); 832 else 833 clear_bit(SOCK_PASSCRED, &sock->flags); 834 break; 835 836 case SO_TIMESTAMP: 837 case SO_TIMESTAMPNS: 838 if (valbool) { 839 if (optname == SO_TIMESTAMP) 840 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 841 else 842 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 843 sock_set_flag(sk, SOCK_RCVTSTAMP); 844 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 845 } else { 846 sock_reset_flag(sk, SOCK_RCVTSTAMP); 847 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 848 } 849 break; 850 851 case SO_TIMESTAMPING: 852 if (val & ~SOF_TIMESTAMPING_MASK) { 853 ret = -EINVAL; 854 break; 855 } 856 857 if (val & SOF_TIMESTAMPING_OPT_ID && 858 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 859 if (sk->sk_protocol == IPPROTO_TCP && 860 sk->sk_type == SOCK_STREAM) { 861 if ((1 << sk->sk_state) & 862 (TCPF_CLOSE | TCPF_LISTEN)) { 863 ret = -EINVAL; 864 break; 865 } 866 sk->sk_tskey = tcp_sk(sk)->snd_una; 867 } else { 868 sk->sk_tskey = 0; 869 } 870 } 871 872 if (val & SOF_TIMESTAMPING_OPT_STATS && 873 !(val & SOF_TIMESTAMPING_OPT_TSONLY)) { 874 ret = -EINVAL; 875 break; 876 } 877 878 sk->sk_tsflags = val; 879 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 880 sock_enable_timestamp(sk, 881 SOCK_TIMESTAMPING_RX_SOFTWARE); 882 else 883 sock_disable_timestamp(sk, 884 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 885 break; 886 887 case SO_RCVLOWAT: 888 if (val < 0) 889 val = INT_MAX; 890 if (sock->ops->set_rcvlowat) 891 ret = sock->ops->set_rcvlowat(sk, val); 892 else 893 sk->sk_rcvlowat = val ? 
: 1; 894 break; 895 896 case SO_RCVTIMEO: 897 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 898 break; 899 900 case SO_SNDTIMEO: 901 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 902 break; 903 904 case SO_ATTACH_FILTER: 905 ret = -EINVAL; 906 if (optlen == sizeof(struct sock_fprog)) { 907 struct sock_fprog fprog; 908 909 ret = -EFAULT; 910 if (copy_from_user(&fprog, optval, sizeof(fprog))) 911 break; 912 913 ret = sk_attach_filter(&fprog, sk); 914 } 915 break; 916 917 case SO_ATTACH_BPF: 918 ret = -EINVAL; 919 if (optlen == sizeof(u32)) { 920 u32 ufd; 921 922 ret = -EFAULT; 923 if (copy_from_user(&ufd, optval, sizeof(ufd))) 924 break; 925 926 ret = sk_attach_bpf(ufd, sk); 927 } 928 break; 929 930 case SO_ATTACH_REUSEPORT_CBPF: 931 ret = -EINVAL; 932 if (optlen == sizeof(struct sock_fprog)) { 933 struct sock_fprog fprog; 934 935 ret = -EFAULT; 936 if (copy_from_user(&fprog, optval, sizeof(fprog))) 937 break; 938 939 ret = sk_reuseport_attach_filter(&fprog, sk); 940 } 941 break; 942 943 case SO_ATTACH_REUSEPORT_EBPF: 944 ret = -EINVAL; 945 if (optlen == sizeof(u32)) { 946 u32 ufd; 947 948 ret = -EFAULT; 949 if (copy_from_user(&ufd, optval, sizeof(ufd))) 950 break; 951 952 ret = sk_reuseport_attach_bpf(ufd, sk); 953 } 954 break; 955 956 case SO_DETACH_FILTER: 957 ret = sk_detach_filter(sk); 958 break; 959 960 case SO_LOCK_FILTER: 961 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 962 ret = -EPERM; 963 else 964 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 965 break; 966 967 case SO_PASSSEC: 968 if (valbool) 969 set_bit(SOCK_PASSSEC, &sock->flags); 970 else 971 clear_bit(SOCK_PASSSEC, &sock->flags); 972 break; 973 case SO_MARK: 974 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 975 ret = -EPERM; 976 } else if (val != sk->sk_mark) { 977 sk->sk_mark = val; 978 sk_dst_reset(sk); 979 } 980 break; 981 982 case SO_RXQ_OVFL: 983 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 984 break; 985 986 case SO_WIFI_STATUS: 987 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 988 break; 989 990 case SO_PEEK_OFF: 991 if (sock->ops->set_peek_off) 992 ret = sock->ops->set_peek_off(sk, val); 993 else 994 ret = -EOPNOTSUPP; 995 break; 996 997 case SO_NOFCS: 998 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 999 break; 1000 1001 case SO_SELECT_ERR_QUEUE: 1002 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 1003 break; 1004 1005 #ifdef CONFIG_NET_RX_BUSY_POLL 1006 case SO_BUSY_POLL: 1007 /* allow unprivileged users to decrease the value */ 1008 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 1009 ret = -EPERM; 1010 else { 1011 if (val < 0) 1012 ret = -EINVAL; 1013 else 1014 sk->sk_ll_usec = val; 1015 } 1016 break; 1017 #endif 1018 1019 case SO_MAX_PACING_RATE: 1020 if (val != ~0U) 1021 cmpxchg(&sk->sk_pacing_status, 1022 SK_PACING_NONE, 1023 SK_PACING_NEEDED); 1024 sk->sk_max_pacing_rate = (val == ~0U) ? 
~0UL : val; 1025 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 1026 sk->sk_max_pacing_rate); 1027 break; 1028 1029 case SO_INCOMING_CPU: 1030 sk->sk_incoming_cpu = val; 1031 break; 1032 1033 case SO_CNX_ADVICE: 1034 if (val == 1) 1035 dst_negative_advice(sk); 1036 break; 1037 1038 case SO_ZEROCOPY: 1039 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) { 1040 if (!((sk->sk_type == SOCK_STREAM && 1041 sk->sk_protocol == IPPROTO_TCP) || 1042 (sk->sk_type == SOCK_DGRAM && 1043 sk->sk_protocol == IPPROTO_UDP))) 1044 ret = -ENOTSUPP; 1045 } else if (sk->sk_family != PF_RDS) { 1046 ret = -ENOTSUPP; 1047 } 1048 if (!ret) { 1049 if (val < 0 || val > 1) 1050 ret = -EINVAL; 1051 else 1052 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); 1053 } 1054 break; 1055 1056 case SO_TXTIME: 1057 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { 1058 ret = -EPERM; 1059 } else if (optlen != sizeof(struct sock_txtime)) { 1060 ret = -EINVAL; 1061 } else if (copy_from_user(&sk_txtime, optval, 1062 sizeof(struct sock_txtime))) { 1063 ret = -EFAULT; 1064 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) { 1065 ret = -EINVAL; 1066 } else { 1067 sock_valbool_flag(sk, SOCK_TXTIME, true); 1068 sk->sk_clockid = sk_txtime.clockid; 1069 sk->sk_txtime_deadline_mode = 1070 !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE); 1071 sk->sk_txtime_report_errors = 1072 !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS); 1073 } 1074 break; 1075 1076 case SO_BINDTOIFINDEX: 1077 ret = sock_setbindtodevice_locked(sk, val); 1078 break; 1079 1080 default: 1081 ret = -ENOPROTOOPT; 1082 break; 1083 } 1084 release_sock(sk); 1085 return ret; 1086 } 1087 EXPORT_SYMBOL(sock_setsockopt); 1088 1089 1090 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1091 struct ucred *ucred) 1092 { 1093 ucred->pid = pid_vnr(pid); 1094 ucred->uid = ucred->gid = -1; 1095 if (cred) { 1096 struct user_namespace *current_ns = current_user_ns(); 1097 1098 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1099 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1100 } 1101 } 1102 1103 static int groups_to_user(gid_t __user *dst, const struct group_info *src) 1104 { 1105 struct user_namespace *user_ns = current_user_ns(); 1106 int i; 1107 1108 for (i = 0; i < src->ngroups; i++) 1109 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) 1110 return -EFAULT; 1111 1112 return 0; 1113 } 1114 1115 int sock_getsockopt(struct socket *sock, int level, int optname, 1116 char __user *optval, int __user *optlen) 1117 { 1118 struct sock *sk = sock->sk; 1119 1120 union { 1121 int val; 1122 u64 val64; 1123 struct linger ling; 1124 struct timeval tm; 1125 struct sock_txtime txtime; 1126 } v; 1127 1128 int lv = sizeof(int); 1129 int len; 1130 1131 if (get_user(len, optlen)) 1132 return -EFAULT; 1133 if (len < 0) 1134 return -EINVAL; 1135 1136 memset(&v, 0, sizeof(v)); 1137 1138 switch (optname) { 1139 case SO_DEBUG: 1140 v.val = sock_flag(sk, SOCK_DBG); 1141 break; 1142 1143 case SO_DONTROUTE: 1144 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1145 break; 1146 1147 case SO_BROADCAST: 1148 v.val = sock_flag(sk, SOCK_BROADCAST); 1149 break; 1150 1151 case SO_SNDBUF: 1152 v.val = sk->sk_sndbuf; 1153 break; 1154 1155 case SO_RCVBUF: 1156 v.val = sk->sk_rcvbuf; 1157 break; 1158 1159 case SO_REUSEADDR: 1160 v.val = sk->sk_reuse; 1161 break; 1162 1163 case SO_REUSEPORT: 1164 v.val = sk->sk_reuseport; 1165 break; 1166 1167 case SO_KEEPALIVE: 1168 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1169 break; 1170 1171 case SO_TYPE: 1172 v.val = sk->sk_type; 1173 
break; 1174 1175 case SO_PROTOCOL: 1176 v.val = sk->sk_protocol; 1177 break; 1178 1179 case SO_DOMAIN: 1180 v.val = sk->sk_family; 1181 break; 1182 1183 case SO_ERROR: 1184 v.val = -sock_error(sk); 1185 if (v.val == 0) 1186 v.val = xchg(&sk->sk_err_soft, 0); 1187 break; 1188 1189 case SO_OOBINLINE: 1190 v.val = sock_flag(sk, SOCK_URGINLINE); 1191 break; 1192 1193 case SO_NO_CHECK: 1194 v.val = sk->sk_no_check_tx; 1195 break; 1196 1197 case SO_PRIORITY: 1198 v.val = sk->sk_priority; 1199 break; 1200 1201 case SO_LINGER: 1202 lv = sizeof(v.ling); 1203 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1204 v.ling.l_linger = sk->sk_lingertime / HZ; 1205 break; 1206 1207 case SO_BSDCOMPAT: 1208 sock_warn_obsolete_bsdism("getsockopt"); 1209 break; 1210 1211 case SO_TIMESTAMP: 1212 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1213 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1214 break; 1215 1216 case SO_TIMESTAMPNS: 1217 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 1218 break; 1219 1220 case SO_TIMESTAMPING: 1221 v.val = sk->sk_tsflags; 1222 break; 1223 1224 case SO_RCVTIMEO: 1225 lv = sizeof(struct timeval); 1226 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 1227 v.tm.tv_sec = 0; 1228 v.tm.tv_usec = 0; 1229 } else { 1230 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1231 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ; 1232 } 1233 break; 1234 1235 case SO_SNDTIMEO: 1236 lv = sizeof(struct timeval); 1237 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 1238 v.tm.tv_sec = 0; 1239 v.tm.tv_usec = 0; 1240 } else { 1241 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1242 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ; 1243 } 1244 break; 1245 1246 case SO_RCVLOWAT: 1247 v.val = sk->sk_rcvlowat; 1248 break; 1249 1250 case SO_SNDLOWAT: 1251 v.val = 1; 1252 break; 1253 1254 case SO_PASSCRED: 1255 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1256 break; 1257 1258 case SO_PEERCRED: 1259 { 1260 struct ucred peercred; 1261 if (len > sizeof(peercred)) 1262 len = sizeof(peercred); 1263 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1264 if (copy_to_user(optval, &peercred, len)) 1265 return -EFAULT; 1266 goto lenout; 1267 } 1268 1269 case SO_PEERGROUPS: 1270 { 1271 int ret, n; 1272 1273 if (!sk->sk_peer_cred) 1274 return -ENODATA; 1275 1276 n = sk->sk_peer_cred->group_info->ngroups; 1277 if (len < n * sizeof(gid_t)) { 1278 len = n * sizeof(gid_t); 1279 return put_user(len, optlen) ? -EFAULT : -ERANGE; 1280 } 1281 len = n * sizeof(gid_t); 1282 1283 ret = groups_to_user((gid_t __user *)optval, 1284 sk->sk_peer_cred->group_info); 1285 if (ret) 1286 return ret; 1287 goto lenout; 1288 } 1289 1290 case SO_PEERNAME: 1291 { 1292 char address[128]; 1293 1294 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2); 1295 if (lv < 0) 1296 return -ENOTCONN; 1297 if (lv < len) 1298 return -EINVAL; 1299 if (copy_to_user(optval, address, len)) 1300 return -EFAULT; 1301 goto lenout; 1302 } 1303 1304 /* Dubious BSD thing... Probably nobody even uses it, but 1305 * the UNIX standard wants it for whatever reason... 
-DaveM 1306 */ 1307 case SO_ACCEPTCONN: 1308 v.val = sk->sk_state == TCP_LISTEN; 1309 break; 1310 1311 case SO_PASSSEC: 1312 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1313 break; 1314 1315 case SO_PEERSEC: 1316 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1317 1318 case SO_MARK: 1319 v.val = sk->sk_mark; 1320 break; 1321 1322 case SO_RXQ_OVFL: 1323 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1324 break; 1325 1326 case SO_WIFI_STATUS: 1327 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1328 break; 1329 1330 case SO_PEEK_OFF: 1331 if (!sock->ops->set_peek_off) 1332 return -EOPNOTSUPP; 1333 1334 v.val = sk->sk_peek_off; 1335 break; 1336 case SO_NOFCS: 1337 v.val = sock_flag(sk, SOCK_NOFCS); 1338 break; 1339 1340 case SO_BINDTODEVICE: 1341 return sock_getbindtodevice(sk, optval, optlen, len); 1342 1343 case SO_GET_FILTER: 1344 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1345 if (len < 0) 1346 return len; 1347 1348 goto lenout; 1349 1350 case SO_LOCK_FILTER: 1351 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1352 break; 1353 1354 case SO_BPF_EXTENSIONS: 1355 v.val = bpf_tell_extensions(); 1356 break; 1357 1358 case SO_SELECT_ERR_QUEUE: 1359 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1360 break; 1361 1362 #ifdef CONFIG_NET_RX_BUSY_POLL 1363 case SO_BUSY_POLL: 1364 v.val = sk->sk_ll_usec; 1365 break; 1366 #endif 1367 1368 case SO_MAX_PACING_RATE: 1369 /* 32bit version */ 1370 v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U); 1371 break; 1372 1373 case SO_INCOMING_CPU: 1374 v.val = sk->sk_incoming_cpu; 1375 break; 1376 1377 case SO_MEMINFO: 1378 { 1379 u32 meminfo[SK_MEMINFO_VARS]; 1380 1381 if (get_user(len, optlen)) 1382 return -EFAULT; 1383 1384 sk_get_meminfo(sk, meminfo); 1385 1386 len = min_t(unsigned int, len, sizeof(meminfo)); 1387 if (copy_to_user(optval, &meminfo, len)) 1388 return -EFAULT; 1389 1390 goto lenout; 1391 } 1392 1393 #ifdef CONFIG_NET_RX_BUSY_POLL 1394 case SO_INCOMING_NAPI_ID: 1395 v.val = READ_ONCE(sk->sk_napi_id); 1396 1397 /* aggregate non-NAPI IDs down to 0 */ 1398 if (v.val < MIN_NAPI_ID) 1399 v.val = 0; 1400 1401 break; 1402 #endif 1403 1404 case SO_COOKIE: 1405 lv = sizeof(u64); 1406 if (len < lv) 1407 return -EINVAL; 1408 v.val64 = sock_gen_cookie(sk); 1409 break; 1410 1411 case SO_ZEROCOPY: 1412 v.val = sock_flag(sk, SOCK_ZEROCOPY); 1413 break; 1414 1415 case SO_TXTIME: 1416 lv = sizeof(v.txtime); 1417 v.txtime.clockid = sk->sk_clockid; 1418 v.txtime.flags |= sk->sk_txtime_deadline_mode ? 1419 SOF_TXTIME_DEADLINE_MODE : 0; 1420 v.txtime.flags |= sk->sk_txtime_report_errors ? 1421 SOF_TXTIME_REPORT_ERRORS : 0; 1422 break; 1423 1424 case SO_BINDTOIFINDEX: 1425 v.val = sk->sk_bound_dev_if; 1426 break; 1427 1428 default: 1429 /* We implement the SO_SNDLOWAT etc to not be settable 1430 * (1003.1g 7). 1431 */ 1432 return -ENOPROTOOPT; 1433 } 1434 1435 if (len > lv) 1436 len = lv; 1437 if (copy_to_user(optval, &v, len)) 1438 return -EFAULT; 1439 lenout: 1440 if (put_user(len, optlen)) 1441 return -EFAULT; 1442 return 0; 1443 } 1444 1445 /* 1446 * Initialize an sk_lock. 1447 * 1448 * (We also register the sk_lock with the lock validator.) 
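 *
 * Kernel-internal sockets (sk->sk_kern_sock) get separate, "k-" prefixed
 * lockdep classes so they are not mixed up with user-space sockets of the
 * same address family.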
1449 */ 1450 static inline void sock_lock_init(struct sock *sk) 1451 { 1452 if (sk->sk_kern_sock) 1453 sock_lock_init_class_and_name( 1454 sk, 1455 af_family_kern_slock_key_strings[sk->sk_family], 1456 af_family_kern_slock_keys + sk->sk_family, 1457 af_family_kern_key_strings[sk->sk_family], 1458 af_family_kern_keys + sk->sk_family); 1459 else 1460 sock_lock_init_class_and_name( 1461 sk, 1462 af_family_slock_key_strings[sk->sk_family], 1463 af_family_slock_keys + sk->sk_family, 1464 af_family_key_strings[sk->sk_family], 1465 af_family_keys + sk->sk_family); 1466 } 1467 1468 /* 1469 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1470 * even temporarly, because of RCU lookups. sk_node should also be left as is. 1471 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1472 */ 1473 static void sock_copy(struct sock *nsk, const struct sock *osk) 1474 { 1475 #ifdef CONFIG_SECURITY_NETWORK 1476 void *sptr = nsk->sk_security; 1477 #endif 1478 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1479 1480 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1481 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1482 1483 #ifdef CONFIG_SECURITY_NETWORK 1484 nsk->sk_security = sptr; 1485 security_sk_clone(osk, nsk); 1486 #endif 1487 } 1488 1489 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1490 int family) 1491 { 1492 struct sock *sk; 1493 struct kmem_cache *slab; 1494 1495 slab = prot->slab; 1496 if (slab != NULL) { 1497 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1498 if (!sk) 1499 return sk; 1500 if (priority & __GFP_ZERO) 1501 sk_prot_clear_nulls(sk, prot->obj_size); 1502 } else 1503 sk = kmalloc(prot->obj_size, priority); 1504 1505 if (sk != NULL) { 1506 if (security_sk_alloc(sk, family, priority)) 1507 goto out_free; 1508 1509 if (!try_module_get(prot->owner)) 1510 goto out_free_sec; 1511 sk_tx_queue_clear(sk); 1512 } 1513 1514 return sk; 1515 1516 out_free_sec: 1517 security_sk_free(sk); 1518 out_free: 1519 if (slab != NULL) 1520 kmem_cache_free(slab, sk); 1521 else 1522 kfree(sk); 1523 return NULL; 1524 } 1525 1526 static void sk_prot_free(struct proto *prot, struct sock *sk) 1527 { 1528 struct kmem_cache *slab; 1529 struct module *owner; 1530 1531 owner = prot->owner; 1532 slab = prot->slab; 1533 1534 cgroup_sk_free(&sk->sk_cgrp_data); 1535 mem_cgroup_sk_free(sk); 1536 security_sk_free(sk); 1537 if (slab != NULL) 1538 kmem_cache_free(slab, sk); 1539 else 1540 kfree(sk); 1541 module_put(owner); 1542 } 1543 1544 /** 1545 * sk_alloc - All socket objects are allocated here 1546 * @net: the applicable net namespace 1547 * @family: protocol family 1548 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1549 * @prot: struct proto associated with this new sock instance 1550 * @kern: is this to be a kernel socket? 1551 */ 1552 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1553 struct proto *prot, int kern) 1554 { 1555 struct sock *sk; 1556 1557 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1558 if (sk) { 1559 sk->sk_family = family; 1560 /* 1561 * See comment in struct sock definition to understand 1562 * why we need sk_prot_creator -acme 1563 */ 1564 sk->sk_prot = sk->sk_prot_creator = prot; 1565 sk->sk_kern_sock = kern; 1566 sock_lock_init(sk); 1567 sk->sk_net_refcnt = kern ? 
0 : 1; 1568 if (likely(sk->sk_net_refcnt)) { 1569 get_net(net); 1570 sock_inuse_add(net, 1); 1571 } 1572 1573 sock_net_set(sk, net); 1574 refcount_set(&sk->sk_wmem_alloc, 1); 1575 1576 mem_cgroup_sk_alloc(sk); 1577 cgroup_sk_alloc(&sk->sk_cgrp_data); 1578 sock_update_classid(&sk->sk_cgrp_data); 1579 sock_update_netprioidx(&sk->sk_cgrp_data); 1580 } 1581 1582 return sk; 1583 } 1584 EXPORT_SYMBOL(sk_alloc); 1585 1586 /* Sockets having SOCK_RCU_FREE will call this function after one RCU 1587 * grace period. This is the case for UDP sockets and TCP listeners. 1588 */ 1589 static void __sk_destruct(struct rcu_head *head) 1590 { 1591 struct sock *sk = container_of(head, struct sock, sk_rcu); 1592 struct sk_filter *filter; 1593 1594 if (sk->sk_destruct) 1595 sk->sk_destruct(sk); 1596 1597 filter = rcu_dereference_check(sk->sk_filter, 1598 refcount_read(&sk->sk_wmem_alloc) == 0); 1599 if (filter) { 1600 sk_filter_uncharge(sk, filter); 1601 RCU_INIT_POINTER(sk->sk_filter, NULL); 1602 } 1603 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1604 reuseport_detach_sock(sk); 1605 1606 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1607 1608 if (atomic_read(&sk->sk_omem_alloc)) 1609 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1610 __func__, atomic_read(&sk->sk_omem_alloc)); 1611 1612 if (sk->sk_frag.page) { 1613 put_page(sk->sk_frag.page); 1614 sk->sk_frag.page = NULL; 1615 } 1616 1617 if (sk->sk_peer_cred) 1618 put_cred(sk->sk_peer_cred); 1619 put_pid(sk->sk_peer_pid); 1620 if (likely(sk->sk_net_refcnt)) 1621 put_net(sock_net(sk)); 1622 sk_prot_free(sk->sk_prot_creator, sk); 1623 } 1624 1625 void sk_destruct(struct sock *sk) 1626 { 1627 if (sock_flag(sk, SOCK_RCU_FREE)) 1628 call_rcu(&sk->sk_rcu, __sk_destruct); 1629 else 1630 __sk_destruct(&sk->sk_rcu); 1631 } 1632 1633 static void __sk_free(struct sock *sk) 1634 { 1635 if (likely(sk->sk_net_refcnt)) 1636 sock_inuse_add(sock_net(sk), -1); 1637 1638 if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk))) 1639 sock_diag_broadcast_destroy(sk); 1640 else 1641 sk_destruct(sk); 1642 } 1643 1644 void sk_free(struct sock *sk) 1645 { 1646 /* 1647 * We subtract one from sk_wmem_alloc and can know if 1648 * some packets are still in some tx queue. 
1649 * If not null, sock_wfree() will call __sk_free(sk) later 1650 */ 1651 if (refcount_dec_and_test(&sk->sk_wmem_alloc)) 1652 __sk_free(sk); 1653 } 1654 EXPORT_SYMBOL(sk_free); 1655 1656 static void sk_init_common(struct sock *sk) 1657 { 1658 skb_queue_head_init(&sk->sk_receive_queue); 1659 skb_queue_head_init(&sk->sk_write_queue); 1660 skb_queue_head_init(&sk->sk_error_queue); 1661 1662 rwlock_init(&sk->sk_callback_lock); 1663 lockdep_set_class_and_name(&sk->sk_receive_queue.lock, 1664 af_rlock_keys + sk->sk_family, 1665 af_family_rlock_key_strings[sk->sk_family]); 1666 lockdep_set_class_and_name(&sk->sk_write_queue.lock, 1667 af_wlock_keys + sk->sk_family, 1668 af_family_wlock_key_strings[sk->sk_family]); 1669 lockdep_set_class_and_name(&sk->sk_error_queue.lock, 1670 af_elock_keys + sk->sk_family, 1671 af_family_elock_key_strings[sk->sk_family]); 1672 lockdep_set_class_and_name(&sk->sk_callback_lock, 1673 af_callback_keys + sk->sk_family, 1674 af_family_clock_key_strings[sk->sk_family]); 1675 } 1676 1677 /** 1678 * sk_clone_lock - clone a socket, and lock its clone 1679 * @sk: the socket to clone 1680 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1681 * 1682 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1683 */ 1684 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1685 { 1686 struct sock *newsk; 1687 bool is_charged = true; 1688 1689 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1690 if (newsk != NULL) { 1691 struct sk_filter *filter; 1692 1693 sock_copy(newsk, sk); 1694 1695 newsk->sk_prot_creator = sk->sk_prot; 1696 1697 /* SANITY */ 1698 if (likely(newsk->sk_net_refcnt)) 1699 get_net(sock_net(newsk)); 1700 sk_node_init(&newsk->sk_node); 1701 sock_lock_init(newsk); 1702 bh_lock_sock(newsk); 1703 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1704 newsk->sk_backlog.len = 0; 1705 1706 atomic_set(&newsk->sk_rmem_alloc, 0); 1707 /* 1708 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1709 */ 1710 refcount_set(&newsk->sk_wmem_alloc, 1); 1711 atomic_set(&newsk->sk_omem_alloc, 0); 1712 sk_init_common(newsk); 1713 1714 newsk->sk_dst_cache = NULL; 1715 newsk->sk_dst_pending_confirm = 0; 1716 newsk->sk_wmem_queued = 0; 1717 newsk->sk_forward_alloc = 0; 1718 atomic_set(&newsk->sk_drops, 0); 1719 newsk->sk_send_head = NULL; 1720 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1721 atomic_set(&newsk->sk_zckey, 0); 1722 1723 sock_reset_flag(newsk, SOCK_DONE); 1724 mem_cgroup_sk_alloc(newsk); 1725 cgroup_sk_alloc(&newsk->sk_cgrp_data); 1726 1727 rcu_read_lock(); 1728 filter = rcu_dereference(sk->sk_filter); 1729 if (filter != NULL) 1730 /* though it's an empty new sock, the charging may fail 1731 * if sysctl_optmem_max was changed between creation of 1732 * original socket and cloning 1733 */ 1734 is_charged = sk_filter_charge(newsk, filter); 1735 RCU_INIT_POINTER(newsk->sk_filter, filter); 1736 rcu_read_unlock(); 1737 1738 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1739 /* We need to make sure that we don't uncharge the new 1740 * socket if we couldn't charge it in the first place 1741 * as otherwise we uncharge the parent's filter. 
1742 */ 1743 if (!is_charged) 1744 RCU_INIT_POINTER(newsk->sk_filter, NULL); 1745 sk_free_unlock_clone(newsk); 1746 newsk = NULL; 1747 goto out; 1748 } 1749 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); 1750 1751 newsk->sk_err = 0; 1752 newsk->sk_err_soft = 0; 1753 newsk->sk_priority = 0; 1754 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1755 atomic64_set(&newsk->sk_cookie, 0); 1756 if (likely(newsk->sk_net_refcnt)) 1757 sock_inuse_add(sock_net(newsk), 1); 1758 1759 /* 1760 * Before updating sk_refcnt, we must commit prior changes to memory 1761 * (Documentation/RCU/rculist_nulls.txt for details) 1762 */ 1763 smp_wmb(); 1764 refcount_set(&newsk->sk_refcnt, 2); 1765 1766 /* 1767 * Increment the counter in the same struct proto as the master 1768 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1769 * is the same as sk->sk_prot->socks, as this field was copied 1770 * with memcpy). 1771 * 1772 * This _changes_ the previous behaviour, where 1773 * tcp_create_openreq_child always was incrementing the 1774 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1775 * to be taken into account in all callers. -acme 1776 */ 1777 sk_refcnt_debug_inc(newsk); 1778 sk_set_socket(newsk, NULL); 1779 newsk->sk_wq = NULL; 1780 1781 if (newsk->sk_prot->sockets_allocated) 1782 sk_sockets_allocated_inc(newsk); 1783 1784 if (sock_needs_netstamp(sk) && 1785 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1786 net_enable_timestamp(); 1787 } 1788 out: 1789 return newsk; 1790 } 1791 EXPORT_SYMBOL_GPL(sk_clone_lock); 1792 1793 void sk_free_unlock_clone(struct sock *sk) 1794 { 1795 /* It is still raw copy of parent, so invalidate 1796 * destructor and make plain sk_free() */ 1797 sk->sk_destruct = NULL; 1798 bh_unlock_sock(sk); 1799 sk_free(sk); 1800 } 1801 EXPORT_SYMBOL_GPL(sk_free_unlock_clone); 1802 1803 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1804 { 1805 u32 max_segs = 1; 1806 1807 sk_dst_set(sk, dst); 1808 sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps; 1809 if (sk->sk_route_caps & NETIF_F_GSO) 1810 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1811 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1812 if (sk_can_gso(sk)) { 1813 if (dst->header_len && !xfrm_dst_offload_ok(dst)) { 1814 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1815 } else { 1816 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1817 sk->sk_gso_max_size = dst->dev->gso_max_size; 1818 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1819 } 1820 } 1821 sk->sk_gso_max_segs = max_segs; 1822 } 1823 EXPORT_SYMBOL_GPL(sk_setup_caps); 1824 1825 /* 1826 * Simple resource managers for sockets. 1827 */ 1828 1829 1830 /* 1831 * Write buffer destructor automatically called from kfree_skb. 1832 */ 1833 void sock_wfree(struct sk_buff *skb) 1834 { 1835 struct sock *sk = skb->sk; 1836 unsigned int len = skb->truesize; 1837 1838 if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) { 1839 /* 1840 * Keep a reference on sk_wmem_alloc, this will be released 1841 * after sk_write_space() call 1842 */ 1843 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc)); 1844 sk->sk_write_space(sk); 1845 len = 1; 1846 } 1847 /* 1848 * if sk_wmem_alloc reaches 0, we must finish what sk_free() 1849 * could not do because of in-flight packets 1850 */ 1851 if (refcount_sub_and_test(len, &sk->sk_wmem_alloc)) 1852 __sk_free(sk); 1853 } 1854 EXPORT_SYMBOL(sock_wfree); 1855 1856 /* This variant of sock_wfree() is used by TCP, 1857 * since it sets SOCK_USE_WRITE_QUEUE. 
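 * It only drops the skb's contribution to sk_wmem_alloc and does not
 * invoke the sk_write_space() callback.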
1858 */ 1859 void __sock_wfree(struct sk_buff *skb) 1860 { 1861 struct sock *sk = skb->sk; 1862 1863 if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)) 1864 __sk_free(sk); 1865 } 1866 1867 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk) 1868 { 1869 skb_orphan(skb); 1870 skb->sk = sk; 1871 #ifdef CONFIG_INET 1872 if (unlikely(!sk_fullsock(sk))) { 1873 skb->destructor = sock_edemux; 1874 sock_hold(sk); 1875 return; 1876 } 1877 #endif 1878 skb->destructor = sock_wfree; 1879 skb_set_hash_from_sk(skb, sk); 1880 /* 1881 * We used to take a refcount on sk, but following operation 1882 * is enough to guarantee sk_free() wont free this sock until 1883 * all in-flight packets are completed 1884 */ 1885 refcount_add(skb->truesize, &sk->sk_wmem_alloc); 1886 } 1887 EXPORT_SYMBOL(skb_set_owner_w); 1888 1889 /* This helper is used by netem, as it can hold packets in its 1890 * delay queue. We want to allow the owner socket to send more 1891 * packets, as if they were already TX completed by a typical driver. 1892 * But we also want to keep skb->sk set because some packet schedulers 1893 * rely on it (sch_fq for example). 1894 */ 1895 void skb_orphan_partial(struct sk_buff *skb) 1896 { 1897 if (skb_is_tcp_pure_ack(skb)) 1898 return; 1899 1900 if (skb->destructor == sock_wfree 1901 #ifdef CONFIG_INET 1902 || skb->destructor == tcp_wfree 1903 #endif 1904 ) { 1905 struct sock *sk = skb->sk; 1906 1907 if (refcount_inc_not_zero(&sk->sk_refcnt)) { 1908 WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc)); 1909 skb->destructor = sock_efree; 1910 } 1911 } else { 1912 skb_orphan(skb); 1913 } 1914 } 1915 EXPORT_SYMBOL(skb_orphan_partial); 1916 1917 /* 1918 * Read buffer destructor automatically called from kfree_skb. 1919 */ 1920 void sock_rfree(struct sk_buff *skb) 1921 { 1922 struct sock *sk = skb->sk; 1923 unsigned int len = skb->truesize; 1924 1925 atomic_sub(len, &sk->sk_rmem_alloc); 1926 sk_mem_uncharge(sk, len); 1927 } 1928 EXPORT_SYMBOL(sock_rfree); 1929 1930 /* 1931 * Buffer destructor for skbs that are not used directly in read or write 1932 * path, e.g. for error handler skbs. Automatically called from kfree_skb. 1933 */ 1934 void sock_efree(struct sk_buff *skb) 1935 { 1936 sock_put(skb->sk); 1937 } 1938 EXPORT_SYMBOL(sock_efree); 1939 1940 kuid_t sock_i_uid(struct sock *sk) 1941 { 1942 kuid_t uid; 1943 1944 read_lock_bh(&sk->sk_callback_lock); 1945 uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID; 1946 read_unlock_bh(&sk->sk_callback_lock); 1947 return uid; 1948 } 1949 EXPORT_SYMBOL(sock_i_uid); 1950 1951 unsigned long sock_i_ino(struct sock *sk) 1952 { 1953 unsigned long ino; 1954 1955 read_lock_bh(&sk->sk_callback_lock); 1956 ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0; 1957 read_unlock_bh(&sk->sk_callback_lock); 1958 return ino; 1959 } 1960 EXPORT_SYMBOL(sock_i_ino); 1961 1962 /* 1963 * Allocate a skb from the socket's send buffer. 
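 * The allocation is refused (NULL is returned) once sk_wmem_alloc has
 * reached sk_sndbuf, unless @force is set (see the check below).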
1964 */ 1965 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, 1966 gfp_t priority) 1967 { 1968 if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { 1969 struct sk_buff *skb = alloc_skb(size, priority); 1970 if (skb) { 1971 skb_set_owner_w(skb, sk); 1972 return skb; 1973 } 1974 } 1975 return NULL; 1976 } 1977 EXPORT_SYMBOL(sock_wmalloc); 1978 1979 static void sock_ofree(struct sk_buff *skb) 1980 { 1981 struct sock *sk = skb->sk; 1982 1983 atomic_sub(skb->truesize, &sk->sk_omem_alloc); 1984 } 1985 1986 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, 1987 gfp_t priority) 1988 { 1989 struct sk_buff *skb; 1990 1991 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */ 1992 if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) > 1993 sysctl_optmem_max) 1994 return NULL; 1995 1996 skb = alloc_skb(size, priority); 1997 if (!skb) 1998 return NULL; 1999 2000 atomic_add(skb->truesize, &sk->sk_omem_alloc); 2001 skb->sk = sk; 2002 skb->destructor = sock_ofree; 2003 return skb; 2004 } 2005 2006 /* 2007 * Allocate a memory block from the socket's option memory buffer. 2008 */ 2009 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 2010 { 2011 if ((unsigned int)size <= sysctl_optmem_max && 2012 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 2013 void *mem; 2014 /* First do the add, to avoid the race if kmalloc 2015 * might sleep. 2016 */ 2017 atomic_add(size, &sk->sk_omem_alloc); 2018 mem = kmalloc(size, priority); 2019 if (mem) 2020 return mem; 2021 atomic_sub(size, &sk->sk_omem_alloc); 2022 } 2023 return NULL; 2024 } 2025 EXPORT_SYMBOL(sock_kmalloc); 2026 2027 /* Free an option memory block. Note, we actually want the inline 2028 * here as this allows gcc to detect the nullify and fold away the 2029 * condition entirely. 2030 */ 2031 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 2032 const bool nullify) 2033 { 2034 if (WARN_ON_ONCE(!mem)) 2035 return; 2036 if (nullify) 2037 kzfree(mem); 2038 else 2039 kfree(mem); 2040 atomic_sub(size, &sk->sk_omem_alloc); 2041 } 2042 2043 void sock_kfree_s(struct sock *sk, void *mem, int size) 2044 { 2045 __sock_kfree_s(sk, mem, size, false); 2046 } 2047 EXPORT_SYMBOL(sock_kfree_s); 2048 2049 void sock_kzfree_s(struct sock *sk, void *mem, int size) 2050 { 2051 __sock_kfree_s(sk, mem, size, true); 2052 } 2053 EXPORT_SYMBOL(sock_kzfree_s); 2054 2055 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 2056 I think, these locks should be removed for datagram sockets. 
2057 */ 2058 static long sock_wait_for_wmem(struct sock *sk, long timeo) 2059 { 2060 DEFINE_WAIT(wait); 2061 2062 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2063 for (;;) { 2064 if (!timeo) 2065 break; 2066 if (signal_pending(current)) 2067 break; 2068 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2069 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2070 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 2071 break; 2072 if (sk->sk_shutdown & SEND_SHUTDOWN) 2073 break; 2074 if (sk->sk_err) 2075 break; 2076 timeo = schedule_timeout(timeo); 2077 } 2078 finish_wait(sk_sleep(sk), &wait); 2079 return timeo; 2080 } 2081 2082 2083 /* 2084 * Generic send/receive buffer handlers 2085 */ 2086 2087 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 2088 unsigned long data_len, int noblock, 2089 int *errcode, int max_page_order) 2090 { 2091 struct sk_buff *skb; 2092 long timeo; 2093 int err; 2094 2095 timeo = sock_sndtimeo(sk, noblock); 2096 for (;;) { 2097 err = sock_error(sk); 2098 if (err != 0) 2099 goto failure; 2100 2101 err = -EPIPE; 2102 if (sk->sk_shutdown & SEND_SHUTDOWN) 2103 goto failure; 2104 2105 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 2106 break; 2107 2108 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 2109 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 2110 err = -EAGAIN; 2111 if (!timeo) 2112 goto failure; 2113 if (signal_pending(current)) 2114 goto interrupted; 2115 timeo = sock_wait_for_wmem(sk, timeo); 2116 } 2117 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 2118 errcode, sk->sk_allocation); 2119 if (skb) 2120 skb_set_owner_w(skb, sk); 2121 return skb; 2122 2123 interrupted: 2124 err = sock_intr_errno(timeo); 2125 failure: 2126 *errcode = err; 2127 return NULL; 2128 } 2129 EXPORT_SYMBOL(sock_alloc_send_pskb); 2130 2131 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 2132 int noblock, int *errcode) 2133 { 2134 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 2135 } 2136 EXPORT_SYMBOL(sock_alloc_send_skb); 2137 2138 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, 2139 struct sockcm_cookie *sockc) 2140 { 2141 u32 tsflags; 2142 2143 switch (cmsg->cmsg_type) { 2144 case SO_MARK: 2145 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 2146 return -EPERM; 2147 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2148 return -EINVAL; 2149 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 2150 break; 2151 case SO_TIMESTAMPING: 2152 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 2153 return -EINVAL; 2154 2155 tsflags = *(u32 *)CMSG_DATA(cmsg); 2156 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK) 2157 return -EINVAL; 2158 2159 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK; 2160 sockc->tsflags |= tsflags; 2161 break; 2162 case SCM_TXTIME: 2163 if (!sock_flag(sk, SOCK_TXTIME)) 2164 return -EINVAL; 2165 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64))) 2166 return -EINVAL; 2167 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg)); 2168 break; 2169 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. 
*/ 2170 case SCM_RIGHTS: 2171 case SCM_CREDENTIALS: 2172 break; 2173 default: 2174 return -EINVAL; 2175 } 2176 return 0; 2177 } 2178 EXPORT_SYMBOL(__sock_cmsg_send); 2179 2180 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 2181 struct sockcm_cookie *sockc) 2182 { 2183 struct cmsghdr *cmsg; 2184 int ret; 2185 2186 for_each_cmsghdr(cmsg, msg) { 2187 if (!CMSG_OK(msg, cmsg)) 2188 return -EINVAL; 2189 if (cmsg->cmsg_level != SOL_SOCKET) 2190 continue; 2191 ret = __sock_cmsg_send(sk, msg, cmsg, sockc); 2192 if (ret) 2193 return ret; 2194 } 2195 return 0; 2196 } 2197 EXPORT_SYMBOL(sock_cmsg_send); 2198 2199 static void sk_enter_memory_pressure(struct sock *sk) 2200 { 2201 if (!sk->sk_prot->enter_memory_pressure) 2202 return; 2203 2204 sk->sk_prot->enter_memory_pressure(sk); 2205 } 2206 2207 static void sk_leave_memory_pressure(struct sock *sk) 2208 { 2209 if (sk->sk_prot->leave_memory_pressure) { 2210 sk->sk_prot->leave_memory_pressure(sk); 2211 } else { 2212 unsigned long *memory_pressure = sk->sk_prot->memory_pressure; 2213 2214 if (memory_pressure && *memory_pressure) 2215 *memory_pressure = 0; 2216 } 2217 } 2218 2219 /* On 32bit arches, an skb frag is limited to 2^15 */ 2220 #define SKB_FRAG_PAGE_ORDER get_order(32768) 2221 2222 /** 2223 * skb_page_frag_refill - check that a page_frag contains enough room 2224 * @sz: minimum size of the fragment we want to get 2225 * @pfrag: pointer to page_frag 2226 * @gfp: priority for memory allocation 2227 * 2228 * Note: While this allocator tries to use high order pages, there is 2229 * no guarantee that allocations succeed. Therefore, @sz MUST be 2230 * less or equal than PAGE_SIZE. 2231 */ 2232 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 2233 { 2234 if (pfrag->page) { 2235 if (page_ref_count(pfrag->page) == 1) { 2236 pfrag->offset = 0; 2237 return true; 2238 } 2239 if (pfrag->offset + sz <= pfrag->size) 2240 return true; 2241 put_page(pfrag->page); 2242 } 2243 2244 pfrag->offset = 0; 2245 if (SKB_FRAG_PAGE_ORDER) { 2246 /* Avoid direct reclaim but allow kswapd to wake */ 2247 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 2248 __GFP_COMP | __GFP_NOWARN | 2249 __GFP_NORETRY, 2250 SKB_FRAG_PAGE_ORDER); 2251 if (likely(pfrag->page)) { 2252 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 2253 return true; 2254 } 2255 } 2256 pfrag->page = alloc_page(gfp); 2257 if (likely(pfrag->page)) { 2258 pfrag->size = PAGE_SIZE; 2259 return true; 2260 } 2261 return false; 2262 } 2263 EXPORT_SYMBOL(skb_page_frag_refill); 2264 2265 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 2266 { 2267 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 2268 return true; 2269 2270 sk_enter_memory_pressure(sk); 2271 sk_stream_moderate_sndbuf(sk); 2272 return false; 2273 } 2274 EXPORT_SYMBOL(sk_page_frag_refill); 2275 2276 static void __lock_sock(struct sock *sk) 2277 __releases(&sk->sk_lock.slock) 2278 __acquires(&sk->sk_lock.slock) 2279 { 2280 DEFINE_WAIT(wait); 2281 2282 for (;;) { 2283 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 2284 TASK_UNINTERRUPTIBLE); 2285 spin_unlock_bh(&sk->sk_lock.slock); 2286 schedule(); 2287 spin_lock_bh(&sk->sk_lock.slock); 2288 if (!sock_owned_by_user(sk)) 2289 break; 2290 } 2291 finish_wait(&sk->sk_lock.wq, &wait); 2292 } 2293 2294 void __release_sock(struct sock *sk) 2295 __releases(&sk->sk_lock.slock) 2296 __acquires(&sk->sk_lock.slock) 2297 { 2298 struct sk_buff *skb, *next; 2299 2300 while ((skb = sk->sk_backlog.head) != NULL) { 2301 sk->sk_backlog.head = 
sk->sk_backlog.tail = NULL;

		spin_unlock_bh(&sk->sk_lock.slock);

		do {
			next = skb->next;
			prefetch(next);
			WARN_ON_ONCE(skb_dst_is_noref(skb));
			skb_mark_not_on_list(skb);
			sk_backlog_rcv(sk, skb);

			cond_resched();

			skb = next;
		} while (skb != NULL);

		spin_lock_bh(&sk->sk_lock.slock);
	}

	/*
	 * Doing the zeroing here guarantees we cannot loop forever
	 * while a wild producer attempts to flood us.
	 */
	sk->sk_backlog.len = 0;
}

void __sk_flush_backlog(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	__release_sock(sk);
	spin_unlock_bh(&sk->sk_lock.slock);
}

/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	int rc;

	add_wait_queue(sk_sleep(sk), &wait);
	sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
	sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	remove_wait_queue(sk_sleep(sk), &wait);
	return rc;
}
EXPORT_SYMBOL(sk_wait_data);

/**
 *	__sk_mem_raise_allocated - increase memory_allocated
 *	@sk: socket
 *	@size: memory size to allocate
 *	@amt: pages to allocate
 *	@kind: allocation type
 *
 *	Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
	struct proto *prot = sk->sk_prot;
	long allocated = sk_memory_allocated_add(sk, amt);
	bool charged = true;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt)))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit.
*/ 2389 if (allocated > sk_prot_mem_limits(sk, 2)) 2390 goto suppress_allocation; 2391 2392 /* guarantee minimum buffer size under pressure */ 2393 if (kind == SK_MEM_RECV) { 2394 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot)) 2395 return 1; 2396 2397 } else { /* SK_MEM_SEND */ 2398 int wmem0 = sk_get_wmem0(sk, prot); 2399 2400 if (sk->sk_type == SOCK_STREAM) { 2401 if (sk->sk_wmem_queued < wmem0) 2402 return 1; 2403 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) { 2404 return 1; 2405 } 2406 } 2407 2408 if (sk_has_memory_pressure(sk)) { 2409 int alloc; 2410 2411 if (!sk_under_memory_pressure(sk)) 2412 return 1; 2413 alloc = sk_sockets_allocated_read_positive(sk); 2414 if (sk_prot_mem_limits(sk, 2) > alloc * 2415 sk_mem_pages(sk->sk_wmem_queued + 2416 atomic_read(&sk->sk_rmem_alloc) + 2417 sk->sk_forward_alloc)) 2418 return 1; 2419 } 2420 2421 suppress_allocation: 2422 2423 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2424 sk_stream_moderate_sndbuf(sk); 2425 2426 /* Fail only if socket is _under_ its sndbuf. 2427 * In this case we cannot block, so that we have to fail. 2428 */ 2429 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) 2430 return 1; 2431 } 2432 2433 if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged)) 2434 trace_sock_exceed_buf_limit(sk, prot, allocated, kind); 2435 2436 sk_memory_allocated_sub(sk, amt); 2437 2438 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2439 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt); 2440 2441 return 0; 2442 } 2443 EXPORT_SYMBOL(__sk_mem_raise_allocated); 2444 2445 /** 2446 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2447 * @sk: socket 2448 * @size: memory size to allocate 2449 * @kind: allocation type 2450 * 2451 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2452 * rmem allocation. This function assumes that protocols which have 2453 * memory_pressure use sk_wmem_queued as write buffer accounting. 
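 *
 * As a hedged usage sketch only: example_wmem_schedule() below is not a
 * function in this file, and real callers go through the
 * sk_wmem_schedule()/sk_rmem_schedule() wrappers in include/net/sock.h
 * (which additionally check sk_has_account()). A protocol wanting to queue
 * @size more bytes of send data typically consumes sk_forward_alloc first
 * and only then charges the global accounting:
 *
 *	static bool example_wmem_schedule(struct sock *sk, int size)
 *	{
 *		return size <= sk->sk_forward_alloc ||
 *		       __sk_mem_schedule(sk, size, SK_MEM_SEND);
 *	}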
2454 */ 2455 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2456 { 2457 int ret, amt = sk_mem_pages(size); 2458 2459 sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT; 2460 ret = __sk_mem_raise_allocated(sk, size, amt, kind); 2461 if (!ret) 2462 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT; 2463 return ret; 2464 } 2465 EXPORT_SYMBOL(__sk_mem_schedule); 2466 2467 /** 2468 * __sk_mem_reduce_allocated - reclaim memory_allocated 2469 * @sk: socket 2470 * @amount: number of quanta 2471 * 2472 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc 2473 */ 2474 void __sk_mem_reduce_allocated(struct sock *sk, int amount) 2475 { 2476 sk_memory_allocated_sub(sk, amount); 2477 2478 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 2479 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount); 2480 2481 if (sk_under_memory_pressure(sk) && 2482 (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0))) 2483 sk_leave_memory_pressure(sk); 2484 } 2485 EXPORT_SYMBOL(__sk_mem_reduce_allocated); 2486 2487 /** 2488 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated 2489 * @sk: socket 2490 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple) 2491 */ 2492 void __sk_mem_reclaim(struct sock *sk, int amount) 2493 { 2494 amount >>= SK_MEM_QUANTUM_SHIFT; 2495 sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT; 2496 __sk_mem_reduce_allocated(sk, amount); 2497 } 2498 EXPORT_SYMBOL(__sk_mem_reclaim); 2499 2500 int sk_set_peek_off(struct sock *sk, int val) 2501 { 2502 sk->sk_peek_off = val; 2503 return 0; 2504 } 2505 EXPORT_SYMBOL_GPL(sk_set_peek_off); 2506 2507 /* 2508 * Set of default routines for initialising struct proto_ops when 2509 * the protocol does not support a particular function. In certain 2510 * cases where it makes no sense for a protocol to have a "do nothing" 2511 * function, some default processing is provided. 
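 *
 * As an illustrative sketch only (PF_EXAMPLE and the example_* handlers are
 * hypothetical, and the member list is not exhaustive), a datagram family
 * that only implements its own send/receive paths could wire the remaining
 * operations to these stubs:
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.release	= example_release,
 *		.bind		= sock_no_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= sock_no_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_no_setsockopt,
 *		.getsockopt	= sock_no_getsockopt,
 *		.sendmsg	= example_sendmsg,
 *		.recvmsg	= example_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};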
2512 */ 2513 2514 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) 2515 { 2516 return -EOPNOTSUPP; 2517 } 2518 EXPORT_SYMBOL(sock_no_bind); 2519 2520 int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 2521 int len, int flags) 2522 { 2523 return -EOPNOTSUPP; 2524 } 2525 EXPORT_SYMBOL(sock_no_connect); 2526 2527 int sock_no_socketpair(struct socket *sock1, struct socket *sock2) 2528 { 2529 return -EOPNOTSUPP; 2530 } 2531 EXPORT_SYMBOL(sock_no_socketpair); 2532 2533 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags, 2534 bool kern) 2535 { 2536 return -EOPNOTSUPP; 2537 } 2538 EXPORT_SYMBOL(sock_no_accept); 2539 2540 int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 2541 int peer) 2542 { 2543 return -EOPNOTSUPP; 2544 } 2545 EXPORT_SYMBOL(sock_no_getname); 2546 2547 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) 2548 { 2549 return -EOPNOTSUPP; 2550 } 2551 EXPORT_SYMBOL(sock_no_ioctl); 2552 2553 int sock_no_listen(struct socket *sock, int backlog) 2554 { 2555 return -EOPNOTSUPP; 2556 } 2557 EXPORT_SYMBOL(sock_no_listen); 2558 2559 int sock_no_shutdown(struct socket *sock, int how) 2560 { 2561 return -EOPNOTSUPP; 2562 } 2563 EXPORT_SYMBOL(sock_no_shutdown); 2564 2565 int sock_no_setsockopt(struct socket *sock, int level, int optname, 2566 char __user *optval, unsigned int optlen) 2567 { 2568 return -EOPNOTSUPP; 2569 } 2570 EXPORT_SYMBOL(sock_no_setsockopt); 2571 2572 int sock_no_getsockopt(struct socket *sock, int level, int optname, 2573 char __user *optval, int __user *optlen) 2574 { 2575 return -EOPNOTSUPP; 2576 } 2577 EXPORT_SYMBOL(sock_no_getsockopt); 2578 2579 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len) 2580 { 2581 return -EOPNOTSUPP; 2582 } 2583 EXPORT_SYMBOL(sock_no_sendmsg); 2584 2585 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len) 2586 { 2587 return -EOPNOTSUPP; 2588 } 2589 EXPORT_SYMBOL(sock_no_sendmsg_locked); 2590 2591 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len, 2592 int flags) 2593 { 2594 return -EOPNOTSUPP; 2595 } 2596 EXPORT_SYMBOL(sock_no_recvmsg); 2597 2598 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) 2599 { 2600 /* Mirror missing mmap method error code */ 2601 return -ENODEV; 2602 } 2603 EXPORT_SYMBOL(sock_no_mmap); 2604 2605 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) 2606 { 2607 ssize_t res; 2608 struct msghdr msg = {.msg_flags = flags}; 2609 struct kvec iov; 2610 char *kaddr = kmap(page); 2611 iov.iov_base = kaddr + offset; 2612 iov.iov_len = size; 2613 res = kernel_sendmsg(sock, &msg, &iov, 1, size); 2614 kunmap(page); 2615 return res; 2616 } 2617 EXPORT_SYMBOL(sock_no_sendpage); 2618 2619 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, 2620 int offset, size_t size, int flags) 2621 { 2622 ssize_t res; 2623 struct msghdr msg = {.msg_flags = flags}; 2624 struct kvec iov; 2625 char *kaddr = kmap(page); 2626 2627 iov.iov_base = kaddr + offset; 2628 iov.iov_len = size; 2629 res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size); 2630 kunmap(page); 2631 return res; 2632 } 2633 EXPORT_SYMBOL(sock_no_sendpage_locked); 2634 2635 /* 2636 * Default Socket Callbacks 2637 */ 2638 2639 static void sock_def_wakeup(struct sock *sk) 2640 { 2641 struct socket_wq *wq; 2642 2643 rcu_read_lock(); 2644 wq = rcu_dereference(sk->sk_wq); 2645 if (skwq_has_sleeper(wq)) 2646 
wake_up_interruptible_all(&wq->wait); 2647 rcu_read_unlock(); 2648 } 2649 2650 static void sock_def_error_report(struct sock *sk) 2651 { 2652 struct socket_wq *wq; 2653 2654 rcu_read_lock(); 2655 wq = rcu_dereference(sk->sk_wq); 2656 if (skwq_has_sleeper(wq)) 2657 wake_up_interruptible_poll(&wq->wait, EPOLLERR); 2658 sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); 2659 rcu_read_unlock(); 2660 } 2661 2662 static void sock_def_readable(struct sock *sk) 2663 { 2664 struct socket_wq *wq; 2665 2666 rcu_read_lock(); 2667 wq = rcu_dereference(sk->sk_wq); 2668 if (skwq_has_sleeper(wq)) 2669 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI | 2670 EPOLLRDNORM | EPOLLRDBAND); 2671 sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); 2672 rcu_read_unlock(); 2673 } 2674 2675 static void sock_def_write_space(struct sock *sk) 2676 { 2677 struct socket_wq *wq; 2678 2679 rcu_read_lock(); 2680 2681 /* Do not wake up a writer until he can make "significant" 2682 * progress. --DaveM 2683 */ 2684 if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { 2685 wq = rcu_dereference(sk->sk_wq); 2686 if (skwq_has_sleeper(wq)) 2687 wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT | 2688 EPOLLWRNORM | EPOLLWRBAND); 2689 2690 /* Should agree with poll, otherwise some programs break */ 2691 if (sock_writeable(sk)) 2692 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); 2693 } 2694 2695 rcu_read_unlock(); 2696 } 2697 2698 static void sock_def_destruct(struct sock *sk) 2699 { 2700 } 2701 2702 void sk_send_sigurg(struct sock *sk) 2703 { 2704 if (sk->sk_socket && sk->sk_socket->file) 2705 if (send_sigurg(&sk->sk_socket->file->f_owner)) 2706 sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI); 2707 } 2708 EXPORT_SYMBOL(sk_send_sigurg); 2709 2710 void sk_reset_timer(struct sock *sk, struct timer_list* timer, 2711 unsigned long expires) 2712 { 2713 if (!mod_timer(timer, expires)) 2714 sock_hold(sk); 2715 } 2716 EXPORT_SYMBOL(sk_reset_timer); 2717 2718 void sk_stop_timer(struct sock *sk, struct timer_list* timer) 2719 { 2720 if (del_timer(timer)) 2721 __sock_put(sk); 2722 } 2723 EXPORT_SYMBOL(sk_stop_timer); 2724 2725 void sock_init_data(struct socket *sock, struct sock *sk) 2726 { 2727 sk_init_common(sk); 2728 sk->sk_send_head = NULL; 2729 2730 timer_setup(&sk->sk_timer, NULL, 0); 2731 2732 sk->sk_allocation = GFP_KERNEL; 2733 sk->sk_rcvbuf = sysctl_rmem_default; 2734 sk->sk_sndbuf = sysctl_wmem_default; 2735 sk->sk_state = TCP_CLOSE; 2736 sk_set_socket(sk, sock); 2737 2738 sock_set_flag(sk, SOCK_ZAPPED); 2739 2740 if (sock) { 2741 sk->sk_type = sock->type; 2742 sk->sk_wq = sock->wq; 2743 sock->sk = sk; 2744 sk->sk_uid = SOCK_INODE(sock)->i_uid; 2745 } else { 2746 sk->sk_wq = NULL; 2747 sk->sk_uid = make_kuid(sock_net(sk)->user_ns, 0); 2748 } 2749 2750 rwlock_init(&sk->sk_callback_lock); 2751 if (sk->sk_kern_sock) 2752 lockdep_set_class_and_name( 2753 &sk->sk_callback_lock, 2754 af_kern_callback_keys + sk->sk_family, 2755 af_family_kern_clock_key_strings[sk->sk_family]); 2756 else 2757 lockdep_set_class_and_name( 2758 &sk->sk_callback_lock, 2759 af_callback_keys + sk->sk_family, 2760 af_family_clock_key_strings[sk->sk_family]); 2761 2762 sk->sk_state_change = sock_def_wakeup; 2763 sk->sk_data_ready = sock_def_readable; 2764 sk->sk_write_space = sock_def_write_space; 2765 sk->sk_error_report = sock_def_error_report; 2766 sk->sk_destruct = sock_def_destruct; 2767 2768 sk->sk_frag.page = NULL; 2769 sk->sk_frag.offset = 0; 2770 sk->sk_peek_off = -1; 2771 2772 sk->sk_peer_pid = NULL; 2773 sk->sk_peer_cred = NULL; 2774 
sk->sk_write_pending = 0;
	sk->sk_rcvlowat	=	1;
	sk->sk_rcvtimeo	=	MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo	=	MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
	seqlock_init(&sk->sk_stamp_seq);
#endif
	atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id	=	0;
	sk->sk_ll_usec	=	sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0UL;
	sk->sk_pacing_rate = ~0UL;
	sk->sk_pacing_shift = 10;
	sk->sk_incoming_cpu = -1;

	sk_rx_queue_clear(sk);
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	refcount_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);

void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning : release_cb() might need to release sk ownership,
	 * ie call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Return false if the fast path is taken:
 *
 *   sk_lock.slock locked, owned = 0, BH disabled
 *
 * Return true if the slow path is taken:
 *
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note : We must disable BH
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);

int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sock_read_timestamp(sk));
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		tv = ktime_to_timeval(kt);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sock_read_timestamp(sk));
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		ktime_t kt = ktime_get_real();
		sock_write_timestamp(sk, kt);
		/* reuse kt: sk_stamp is only safely read via sock_read_timestamp() */
		ts = ktime_to_timespec(kt);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * we just set one of the two flags which require net
		 * time stamping, but time stamping might have been on
		 * already because of the other one
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);

/*
 *	Get a socket option on a socket.
 *
 *	FIX: POSIX 1003.1g is very ambiguous here. It states that
 *	asynchronous errors should be reported by getsockopt. We assume
 *	this means if you specify SO_ERROR (otherwise what's the point of it).
 */
int sock_common_getsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_getsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_getsockopt != NULL)
		return sk->sk_prot->compat_getsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_getsockopt);
#endif

int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			int flags)
{
	struct sock *sk = sock->sk;
	int addr_len = 0;
	int err;

	err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
				   flags & ~MSG_DONTWAIT, &addr_len);
	if (err >= 0)
		msg->msg_namelen = addr_len;
	return err;
}
EXPORT_SYMBOL(sock_common_recvmsg);

/*
 *	Set socket options on an inet socket.
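 *
 *	Protocols that have no family-specific option or receive handling of
 *	their own usually point the corresponding struct proto_ops members
 *	straight at these common helpers, e.g. (a partial, illustrative
 *	sketch of such an initializer):
 *
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,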
 */
int sock_common_setsockopt(struct socket *sock, int level, int optname,
			   char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(sock_common_setsockopt);

#ifdef CONFIG_COMPAT
int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
				  char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;

	if (sk->sk_prot->compat_setsockopt != NULL)
		return sk->sk_prot->compat_setsockopt(sk, level, optname,
						      optval, optlen);
	return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
EXPORT_SYMBOL(compat_sock_common_setsockopt);
#endif

void sk_common_release(struct sock *sk)
{
	if (sk->sk_prot->destroy)
		sk->sk_prot->destroy(sk);

	/*
	 * Observation: when sk_common_release() is called, processes have
	 * no access to the socket, but the net still has.
	 * Step one, detach it from networking:
	 *
	 * A. Remove from hash tables.
	 */

	sk->sk_prot->unhash(sk);

	/*
	 * At this point the socket cannot receive new packets, but it is
	 * possible that some packets are in flight because some CPU runs
	 * the receiver and did a hash table lookup before we unhashed the
	 * socket. They will reach the receive queue and be purged by the
	 * socket destructor.
	 *
	 * Also we still have packets pending on the receive queue and
	 * probably our own packets waiting in device queues. sock_destroy
	 * will drain the receive queue, but transmitted packets will delay
	 * socket destruction until the last reference is released.
	 */

	sock_orphan(sk);

	xfrm_sk_free_policy(sk);

	sk_refcnt_debug_release(sk);

	sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);

void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
	memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

	mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
	mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
	mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
	mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
	mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
	mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
	mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
	mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
	mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}

#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];

	return res >= 0 ?
res : 0; 3110 } 3111 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 3112 3113 static void sock_inuse_add(struct net *net, int val) 3114 { 3115 this_cpu_add(*net->core.sock_inuse, val); 3116 } 3117 3118 int sock_inuse_get(struct net *net) 3119 { 3120 int cpu, res = 0; 3121 3122 for_each_possible_cpu(cpu) 3123 res += *per_cpu_ptr(net->core.sock_inuse, cpu); 3124 3125 return res; 3126 } 3127 3128 EXPORT_SYMBOL_GPL(sock_inuse_get); 3129 3130 static int __net_init sock_inuse_init_net(struct net *net) 3131 { 3132 net->core.prot_inuse = alloc_percpu(struct prot_inuse); 3133 if (net->core.prot_inuse == NULL) 3134 return -ENOMEM; 3135 3136 net->core.sock_inuse = alloc_percpu(int); 3137 if (net->core.sock_inuse == NULL) 3138 goto out; 3139 3140 return 0; 3141 3142 out: 3143 free_percpu(net->core.prot_inuse); 3144 return -ENOMEM; 3145 } 3146 3147 static void __net_exit sock_inuse_exit_net(struct net *net) 3148 { 3149 free_percpu(net->core.prot_inuse); 3150 free_percpu(net->core.sock_inuse); 3151 } 3152 3153 static struct pernet_operations net_inuse_ops = { 3154 .init = sock_inuse_init_net, 3155 .exit = sock_inuse_exit_net, 3156 }; 3157 3158 static __init int net_inuse_init(void) 3159 { 3160 if (register_pernet_subsys(&net_inuse_ops)) 3161 panic("Cannot initialize net inuse counters"); 3162 3163 return 0; 3164 } 3165 3166 core_initcall(net_inuse_init); 3167 3168 static void assign_proto_idx(struct proto *prot) 3169 { 3170 prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR); 3171 3172 if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) { 3173 pr_err("PROTO_INUSE_NR exhausted\n"); 3174 return; 3175 } 3176 3177 set_bit(prot->inuse_idx, proto_inuse_idx); 3178 } 3179 3180 static void release_proto_idx(struct proto *prot) 3181 { 3182 if (prot->inuse_idx != PROTO_INUSE_NR - 1) 3183 clear_bit(prot->inuse_idx, proto_inuse_idx); 3184 } 3185 #else 3186 static inline void assign_proto_idx(struct proto *prot) 3187 { 3188 } 3189 3190 static inline void release_proto_idx(struct proto *prot) 3191 { 3192 } 3193 3194 static void sock_inuse_add(struct net *net, int val) 3195 { 3196 } 3197 #endif 3198 3199 static void req_prot_cleanup(struct request_sock_ops *rsk_prot) 3200 { 3201 if (!rsk_prot) 3202 return; 3203 kfree(rsk_prot->slab_name); 3204 rsk_prot->slab_name = NULL; 3205 kmem_cache_destroy(rsk_prot->slab); 3206 rsk_prot->slab = NULL; 3207 } 3208 3209 static int req_prot_init(const struct proto *prot) 3210 { 3211 struct request_sock_ops *rsk_prot = prot->rsk_prot; 3212 3213 if (!rsk_prot) 3214 return 0; 3215 3216 rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s", 3217 prot->name); 3218 if (!rsk_prot->slab_name) 3219 return -ENOMEM; 3220 3221 rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name, 3222 rsk_prot->obj_size, 0, 3223 SLAB_ACCOUNT | prot->slab_flags, 3224 NULL); 3225 3226 if (!rsk_prot->slab) { 3227 pr_crit("%s: Can't create request sock SLAB cache!\n", 3228 prot->name); 3229 return -ENOMEM; 3230 } 3231 return 0; 3232 } 3233 3234 int proto_register(struct proto *prot, int alloc_slab) 3235 { 3236 if (alloc_slab) { 3237 prot->slab = kmem_cache_create_usercopy(prot->name, 3238 prot->obj_size, 0, 3239 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | 3240 prot->slab_flags, 3241 prot->useroffset, prot->usersize, 3242 NULL); 3243 3244 if (prot->slab == NULL) { 3245 pr_crit("%s: Can't create sock SLAB cache!\n", 3246 prot->name); 3247 goto out; 3248 } 3249 3250 if (req_prot_init(prot)) 3251 goto out_free_request_sock_slab; 3252 3253 if (prot->twsk_prot != NULL) { 3254 prot->twsk_prot->twsk_slab_name 
= kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name); 3255 3256 if (prot->twsk_prot->twsk_slab_name == NULL) 3257 goto out_free_request_sock_slab; 3258 3259 prot->twsk_prot->twsk_slab = 3260 kmem_cache_create(prot->twsk_prot->twsk_slab_name, 3261 prot->twsk_prot->twsk_obj_size, 3262 0, 3263 SLAB_ACCOUNT | 3264 prot->slab_flags, 3265 NULL); 3266 if (prot->twsk_prot->twsk_slab == NULL) 3267 goto out_free_timewait_sock_slab_name; 3268 } 3269 } 3270 3271 mutex_lock(&proto_list_mutex); 3272 list_add(&prot->node, &proto_list); 3273 assign_proto_idx(prot); 3274 mutex_unlock(&proto_list_mutex); 3275 return 0; 3276 3277 out_free_timewait_sock_slab_name: 3278 kfree(prot->twsk_prot->twsk_slab_name); 3279 out_free_request_sock_slab: 3280 req_prot_cleanup(prot->rsk_prot); 3281 3282 kmem_cache_destroy(prot->slab); 3283 prot->slab = NULL; 3284 out: 3285 return -ENOBUFS; 3286 } 3287 EXPORT_SYMBOL(proto_register); 3288 3289 void proto_unregister(struct proto *prot) 3290 { 3291 mutex_lock(&proto_list_mutex); 3292 release_proto_idx(prot); 3293 list_del(&prot->node); 3294 mutex_unlock(&proto_list_mutex); 3295 3296 kmem_cache_destroy(prot->slab); 3297 prot->slab = NULL; 3298 3299 req_prot_cleanup(prot->rsk_prot); 3300 3301 if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { 3302 kmem_cache_destroy(prot->twsk_prot->twsk_slab); 3303 kfree(prot->twsk_prot->twsk_slab_name); 3304 prot->twsk_prot->twsk_slab = NULL; 3305 } 3306 } 3307 EXPORT_SYMBOL(proto_unregister); 3308 3309 int sock_load_diag_module(int family, int protocol) 3310 { 3311 if (!protocol) { 3312 if (!sock_is_registered(family)) 3313 return -ENOENT; 3314 3315 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK, 3316 NETLINK_SOCK_DIAG, family); 3317 } 3318 3319 #ifdef CONFIG_INET 3320 if (family == AF_INET && 3321 protocol != IPPROTO_RAW && 3322 !rcu_access_pointer(inet_protos[protocol])) 3323 return -ENOENT; 3324 #endif 3325 3326 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK, 3327 NETLINK_SOCK_DIAG, family, protocol); 3328 } 3329 EXPORT_SYMBOL(sock_load_diag_module); 3330 3331 #ifdef CONFIG_PROC_FS 3332 static void *proto_seq_start(struct seq_file *seq, loff_t *pos) 3333 __acquires(proto_list_mutex) 3334 { 3335 mutex_lock(&proto_list_mutex); 3336 return seq_list_start_head(&proto_list, *pos); 3337 } 3338 3339 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) 3340 { 3341 return seq_list_next(v, &proto_list, pos); 3342 } 3343 3344 static void proto_seq_stop(struct seq_file *seq, void *v) 3345 __releases(proto_list_mutex) 3346 { 3347 mutex_unlock(&proto_list_mutex); 3348 } 3349 3350 static char proto_method_implemented(const void *method) 3351 { 3352 return method == NULL ? 'n' : 'y'; 3353 } 3354 static long sock_prot_memory_allocated(struct proto *proto) 3355 { 3356 return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L; 3357 } 3358 3359 static char *sock_prot_memory_pressure(struct proto *proto) 3360 { 3361 return proto->memory_pressure != NULL ? 3362 proto_memory_pressure(proto) ? "yes" : "no" : "NI"; 3363 } 3364 3365 static void proto_seq_printf(struct seq_file *seq, struct proto *proto) 3366 { 3367 3368 seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s " 3369 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", 3370 proto->name, 3371 proto->obj_size, 3372 sock_prot_inuse_get(seq_file_net(seq), proto), 3373 sock_prot_memory_allocated(proto), 3374 sock_prot_memory_pressure(proto), 3375 proto->max_header, 3376 proto->slab == NULL ? 
"no" : "yes", 3377 module_name(proto->owner), 3378 proto_method_implemented(proto->close), 3379 proto_method_implemented(proto->connect), 3380 proto_method_implemented(proto->disconnect), 3381 proto_method_implemented(proto->accept), 3382 proto_method_implemented(proto->ioctl), 3383 proto_method_implemented(proto->init), 3384 proto_method_implemented(proto->destroy), 3385 proto_method_implemented(proto->shutdown), 3386 proto_method_implemented(proto->setsockopt), 3387 proto_method_implemented(proto->getsockopt), 3388 proto_method_implemented(proto->sendmsg), 3389 proto_method_implemented(proto->recvmsg), 3390 proto_method_implemented(proto->sendpage), 3391 proto_method_implemented(proto->bind), 3392 proto_method_implemented(proto->backlog_rcv), 3393 proto_method_implemented(proto->hash), 3394 proto_method_implemented(proto->unhash), 3395 proto_method_implemented(proto->get_port), 3396 proto_method_implemented(proto->enter_memory_pressure)); 3397 } 3398 3399 static int proto_seq_show(struct seq_file *seq, void *v) 3400 { 3401 if (v == &proto_list) 3402 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 3403 "protocol", 3404 "size", 3405 "sockets", 3406 "memory", 3407 "press", 3408 "maxhdr", 3409 "slab", 3410 "module", 3411 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 3412 else 3413 proto_seq_printf(seq, list_entry(v, struct proto, node)); 3414 return 0; 3415 } 3416 3417 static const struct seq_operations proto_seq_ops = { 3418 .start = proto_seq_start, 3419 .next = proto_seq_next, 3420 .stop = proto_seq_stop, 3421 .show = proto_seq_show, 3422 }; 3423 3424 static __net_init int proto_init_net(struct net *net) 3425 { 3426 if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops, 3427 sizeof(struct seq_net_private))) 3428 return -ENOMEM; 3429 3430 return 0; 3431 } 3432 3433 static __net_exit void proto_exit_net(struct net *net) 3434 { 3435 remove_proc_entry("protocols", net->proc_net); 3436 } 3437 3438 3439 static __net_initdata struct pernet_operations proto_net_ops = { 3440 .init = proto_init_net, 3441 .exit = proto_exit_net, 3442 }; 3443 3444 static int __init proto_init(void) 3445 { 3446 return register_pernet_subsys(&proto_net_ops); 3447 } 3448 3449 subsys_initcall(proto_init); 3450 3451 #endif /* PROC_FS */ 3452 3453 #ifdef CONFIG_NET_RX_BUSY_POLL 3454 bool sk_busy_loop_end(void *p, unsigned long start_time) 3455 { 3456 struct sock *sk = p; 3457 3458 return !skb_queue_empty(&sk->sk_receive_queue) || 3459 sk_busy_loop_timeout(sk, start_time); 3460 } 3461 EXPORT_SYMBOL(sk_busy_loop_end); 3462 #endif /* CONFIG_NET_RX_BUSY_POLL */ 3463