/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	:	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *		Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	:	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *		Steve Whitehouse:	Added default destructor to free
 *					protocol private data.
 *		Steve Whitehouse:	Added various other default routines
 *					common to several socket families.
 *		Chris Evans	:	Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo	:	cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
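/* Note: with the pr_fmt() definition above, the pr_info()/pr_warn() calls
 * later in this file (e.g. in sock_set_timeout() and
 * sock_warn_obsolete_bsdism()) are prefixed with KBUILD_MODNAME, so their
 * messages come out tagged with this object's name.
 */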

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/errqueue.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/sock_diag.h>

#include <linux/filter.h>
#include <net/sock_reuseport.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

#include <net/busy_poll.h>

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/**
 * sk_ns_capable - General socket capability test
 * @sk: Socket to use a capability on or through
 * @user_ns: The user namespace of the capability to use
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in the user namespace @user_ns.
 */
bool sk_ns_capable(const struct sock *sk,
		   struct user_namespace *user_ns, int cap)
{
	return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
		ns_capable(user_ns, cap);
}
EXPORT_SYMBOL(sk_ns_capable);
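
/* Illustrative use of the helpers in this block (not taken from this file):
 * a protocol that wants to honour a privileged request only when both the
 * socket's opener and the current caller are privileged could write
 * something like
 *
 *	if (!sk_net_capable(sk, CAP_NET_ADMIN))
 *		return -EPERM;
 *
 * sk_capable() and sk_net_capable() below merely select the user namespace
 * the capability is checked against.
 */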

/**
 * sk_capable - Socket global capability test
 * @sk: Socket to use a capability on or through
 * @cap: The global capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap in all user namespaces.
 */
bool sk_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, &init_user_ns, cap);
}
EXPORT_SYMBOL(sk_capable);

/**
 * sk_net_capable - Network namespace socket capability test
 * @sk: Socket to use a capability on or through
 * @cap: The capability to use
 *
 * Test to see if the opener of the socket had the capability @cap when
 * the socket was created and the current process has the capability
 * @cap over the network namespace the socket is a member of.
 */
bool sk_net_capable(const struct sock *sk, int cap)
{
	return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
}
EXPORT_SYMBOL(sk_net_capable);

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_VSOCK"    , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_VSOCK"    , "slock-AF_MAX"
};
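/* These string tables are indexed by sk->sk_family and only name lockdep
 * classes: sock_lock_init() further down registers
 * af_family_key_strings[family] and af_family_slock_key_strings[family],
 * so e.g. an AF_INET socket's locks appear in lockdep reports as
 * "sk_lock-AF_INET" and "slock-AF_INET"; the "clock-*" names below are
 * used for sk_callback_lock (see sk_clone_lock()).
 */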
"clock-AF_IRDA" , 251 "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" , 252 "clock-27" , "clock-28" , "clock-AF_CAN" , 253 "clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" , 254 "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" , 255 "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" , 256 "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX" 257 }; 258 259 /* 260 * sk_callback_lock locking rules are per-address-family, 261 * so split the lock classes by using a per-AF key: 262 */ 263 static struct lock_class_key af_callback_keys[AF_MAX]; 264 265 /* Take into consideration the size of the struct sk_buff overhead in the 266 * determination of these values, since that is non-constant across 267 * platforms. This makes socket queueing behavior and performance 268 * not depend upon such differences. 269 */ 270 #define _SK_MEM_PACKETS 256 271 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) 272 #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 273 #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) 274 275 /* Run time adjustable parameters. */ 276 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX; 277 EXPORT_SYMBOL(sysctl_wmem_max); 278 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX; 279 EXPORT_SYMBOL(sysctl_rmem_max); 280 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX; 281 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; 282 283 /* Maximal space eaten by iovec or ancillary data plus some space */ 284 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); 285 EXPORT_SYMBOL(sysctl_optmem_max); 286 287 int sysctl_tstamp_allow_data __read_mostly = 1; 288 289 struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; 290 EXPORT_SYMBOL_GPL(memalloc_socks); 291 292 /** 293 * sk_set_memalloc - sets %SOCK_MEMALLOC 294 * @sk: socket to set it on 295 * 296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 297 * It's the responsibility of the admin to adjust min_free_kbytes 298 * to meet the requirements 299 */ 300 void sk_set_memalloc(struct sock *sk) 301 { 302 sock_set_flag(sk, SOCK_MEMALLOC); 303 sk->sk_allocation |= __GFP_MEMALLOC; 304 static_key_slow_inc(&memalloc_socks); 305 } 306 EXPORT_SYMBOL_GPL(sk_set_memalloc); 307 308 void sk_clear_memalloc(struct sock *sk) 309 { 310 sock_reset_flag(sk, SOCK_MEMALLOC); 311 sk->sk_allocation &= ~__GFP_MEMALLOC; 312 static_key_slow_dec(&memalloc_socks); 313 314 /* 315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward 316 * progress of swapping. SOCK_MEMALLOC may be cleared while 317 * it has rmem allocations due to the last swapfile being deactivated 318 * but there is a risk that the socket is unusable due to exceeding 319 * the rmem limits. Reclaim the reserves and obey rmem limits again. 

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. SOCK_MEMALLOC may be cleared while
	 * it has rmem allocations due to the last swapfile being deactivated
	 * but there is a risk that the socket is unusable due to exceeding
	 * the rmem limits. Reclaim the reserves and obey rmem limits again.
	 */
	sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm, current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

static bool sock_needs_netstamp(const struct sock *sk)
{
	switch (sk->sk_family) {
	case AF_UNSPEC:
	case AF_UNIX:
		return false;
	default:
		return true;
	}
}

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (sock_needs_netstamp(sk) &&
		    !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* we escape from rcu protected region, make sure we don't leak
	 * a norefcounted dst
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	sock_skb_set_dropcount(sk, skb);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
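
/* Drops recorded above (sk->sk_drops) are visible to userspace: with the
 * SO_RXQ_OVFL socket option enabled, recvmsg() receives the current drop
 * count as ancillary data (see sock_skb_set_dropcount() above and the
 * SO_RXQ_OVFL case in sock_setsockopt() below). A minimal, illustrative
 * userspace sketch:
 *
 *	int on = 1;
 *	setsockopt(fd, SOL_SOCKET, SO_RXQ_OVFL, &on, sizeof(on));
 *	// each received message now carries a cmsg with the drop counter
 */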

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_setbindtodevice(struct sock *sk, char __user *optval,
				int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!ns_capable(net->user_ns, CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
538 */ 539 if (optlen > IFNAMSIZ - 1) 540 optlen = IFNAMSIZ - 1; 541 memset(devname, 0, sizeof(devname)); 542 543 ret = -EFAULT; 544 if (copy_from_user(devname, optval, optlen)) 545 goto out; 546 547 index = 0; 548 if (devname[0] != '\0') { 549 struct net_device *dev; 550 551 rcu_read_lock(); 552 dev = dev_get_by_name_rcu(net, devname); 553 if (dev) 554 index = dev->ifindex; 555 rcu_read_unlock(); 556 ret = -ENODEV; 557 if (!dev) 558 goto out; 559 } 560 561 lock_sock(sk); 562 sk->sk_bound_dev_if = index; 563 sk_dst_reset(sk); 564 release_sock(sk); 565 566 ret = 0; 567 568 out: 569 #endif 570 571 return ret; 572 } 573 574 static int sock_getbindtodevice(struct sock *sk, char __user *optval, 575 int __user *optlen, int len) 576 { 577 int ret = -ENOPROTOOPT; 578 #ifdef CONFIG_NETDEVICES 579 struct net *net = sock_net(sk); 580 char devname[IFNAMSIZ]; 581 582 if (sk->sk_bound_dev_if == 0) { 583 len = 0; 584 goto zero; 585 } 586 587 ret = -EINVAL; 588 if (len < IFNAMSIZ) 589 goto out; 590 591 ret = netdev_get_name(net, devname, sk->sk_bound_dev_if); 592 if (ret) 593 goto out; 594 595 len = strlen(devname) + 1; 596 597 ret = -EFAULT; 598 if (copy_to_user(optval, devname, len)) 599 goto out; 600 601 zero: 602 ret = -EFAULT; 603 if (put_user(len, optlen)) 604 goto out; 605 606 ret = 0; 607 608 out: 609 #endif 610 611 return ret; 612 } 613 614 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool) 615 { 616 if (valbool) 617 sock_set_flag(sk, bit); 618 else 619 sock_reset_flag(sk, bit); 620 } 621 622 bool sk_mc_loop(struct sock *sk) 623 { 624 if (dev_recursion_level()) 625 return false; 626 if (!sk) 627 return true; 628 switch (sk->sk_family) { 629 case AF_INET: 630 return inet_sk(sk)->mc_loop; 631 #if IS_ENABLED(CONFIG_IPV6) 632 case AF_INET6: 633 return inet6_sk(sk)->mc_loop; 634 #endif 635 } 636 WARN_ON(1); 637 return true; 638 } 639 EXPORT_SYMBOL(sk_mc_loop); 640 641 /* 642 * This is meant for all protocols to use and covers goings on 643 * at the socket level. Everything here is generic. 644 */ 645 646 int sock_setsockopt(struct socket *sock, int level, int optname, 647 char __user *optval, unsigned int optlen) 648 { 649 struct sock *sk = sock->sk; 650 int val; 651 int valbool; 652 struct linger ling; 653 int ret = 0; 654 655 /* 656 * Options without arguments 657 */ 658 659 if (optname == SO_BINDTODEVICE) 660 return sock_setbindtodevice(sk, optval, optlen); 661 662 if (optlen < sizeof(int)) 663 return -EINVAL; 664 665 if (get_user(val, (int __user *)optval)) 666 return -EFAULT; 667 668 valbool = val ? 1 : 0; 669 670 lock_sock(sk); 671 672 switch (optname) { 673 case SO_DEBUG: 674 if (val && !capable(CAP_NET_ADMIN)) 675 ret = -EACCES; 676 else 677 sock_valbool_flag(sk, SOCK_DBG, valbool); 678 break; 679 case SO_REUSEADDR: 680 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE); 681 break; 682 case SO_REUSEPORT: 683 sk->sk_reuseport = valbool; 684 break; 685 case SO_TYPE: 686 case SO_PROTOCOL: 687 case SO_DOMAIN: 688 case SO_ERROR: 689 ret = -ENOPROTOOPT; 690 break; 691 case SO_DONTROUTE: 692 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool); 693 break; 694 case SO_BROADCAST: 695 sock_valbool_flag(sk, SOCK_BROADCAST, valbool); 696 break; 697 case SO_SNDBUF: 698 /* Don't error on this BSD doesn't and if you think 699 * about it this is right. Otherwise apps have to 700 * play 'guess the biggest size' games. 
RCVBUF/SNDBUF 701 * are treated in BSD as hints 702 */ 703 val = min_t(u32, val, sysctl_wmem_max); 704 set_sndbuf: 705 sk->sk_userlocks |= SOCK_SNDBUF_LOCK; 706 sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF); 707 /* Wake up sending tasks if we upped the value. */ 708 sk->sk_write_space(sk); 709 break; 710 711 case SO_SNDBUFFORCE: 712 if (!capable(CAP_NET_ADMIN)) { 713 ret = -EPERM; 714 break; 715 } 716 goto set_sndbuf; 717 718 case SO_RCVBUF: 719 /* Don't error on this BSD doesn't and if you think 720 * about it this is right. Otherwise apps have to 721 * play 'guess the biggest size' games. RCVBUF/SNDBUF 722 * are treated in BSD as hints 723 */ 724 val = min_t(u32, val, sysctl_rmem_max); 725 set_rcvbuf: 726 sk->sk_userlocks |= SOCK_RCVBUF_LOCK; 727 /* 728 * We double it on the way in to account for 729 * "struct sk_buff" etc. overhead. Applications 730 * assume that the SO_RCVBUF setting they make will 731 * allow that much actual data to be received on that 732 * socket. 733 * 734 * Applications are unaware that "struct sk_buff" and 735 * other overheads allocate from the receive buffer 736 * during socket buffer allocation. 737 * 738 * And after considering the possible alternatives, 739 * returning the value we actually used in getsockopt 740 * is the most desirable behavior. 741 */ 742 sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF); 743 break; 744 745 case SO_RCVBUFFORCE: 746 if (!capable(CAP_NET_ADMIN)) { 747 ret = -EPERM; 748 break; 749 } 750 goto set_rcvbuf; 751 752 case SO_KEEPALIVE: 753 #ifdef CONFIG_INET 754 if (sk->sk_protocol == IPPROTO_TCP && 755 sk->sk_type == SOCK_STREAM) 756 tcp_set_keepalive(sk, valbool); 757 #endif 758 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); 759 break; 760 761 case SO_OOBINLINE: 762 sock_valbool_flag(sk, SOCK_URGINLINE, valbool); 763 break; 764 765 case SO_NO_CHECK: 766 sk->sk_no_check_tx = valbool; 767 break; 768 769 case SO_PRIORITY: 770 if ((val >= 0 && val <= 6) || 771 ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 772 sk->sk_priority = val; 773 else 774 ret = -EPERM; 775 break; 776 777 case SO_LINGER: 778 if (optlen < sizeof(ling)) { 779 ret = -EINVAL; /* 1003.1g */ 780 break; 781 } 782 if (copy_from_user(&ling, optval, sizeof(ling))) { 783 ret = -EFAULT; 784 break; 785 } 786 if (!ling.l_onoff) 787 sock_reset_flag(sk, SOCK_LINGER); 788 else { 789 #if (BITS_PER_LONG == 32) 790 if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ) 791 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; 792 else 793 #endif 794 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ; 795 sock_set_flag(sk, SOCK_LINGER); 796 } 797 break; 798 799 case SO_BSDCOMPAT: 800 sock_warn_obsolete_bsdism("setsockopt"); 801 break; 802 803 case SO_PASSCRED: 804 if (valbool) 805 set_bit(SOCK_PASSCRED, &sock->flags); 806 else 807 clear_bit(SOCK_PASSCRED, &sock->flags); 808 break; 809 810 case SO_TIMESTAMP: 811 case SO_TIMESTAMPNS: 812 if (valbool) { 813 if (optname == SO_TIMESTAMP) 814 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 815 else 816 sock_set_flag(sk, SOCK_RCVTSTAMPNS); 817 sock_set_flag(sk, SOCK_RCVTSTAMP); 818 sock_enable_timestamp(sk, SOCK_TIMESTAMP); 819 } else { 820 sock_reset_flag(sk, SOCK_RCVTSTAMP); 821 sock_reset_flag(sk, SOCK_RCVTSTAMPNS); 822 } 823 break; 824 825 case SO_TIMESTAMPING: 826 if (val & ~SOF_TIMESTAMPING_MASK) { 827 ret = -EINVAL; 828 break; 829 } 830 831 if (val & SOF_TIMESTAMPING_OPT_ID && 832 !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) { 833 if (sk->sk_protocol == IPPROTO_TCP && 834 sk->sk_type == SOCK_STREAM) { 835 if (sk->sk_state != 
TCP_ESTABLISHED) { 836 ret = -EINVAL; 837 break; 838 } 839 sk->sk_tskey = tcp_sk(sk)->snd_una; 840 } else { 841 sk->sk_tskey = 0; 842 } 843 } 844 sk->sk_tsflags = val; 845 if (val & SOF_TIMESTAMPING_RX_SOFTWARE) 846 sock_enable_timestamp(sk, 847 SOCK_TIMESTAMPING_RX_SOFTWARE); 848 else 849 sock_disable_timestamp(sk, 850 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)); 851 break; 852 853 case SO_RCVLOWAT: 854 if (val < 0) 855 val = INT_MAX; 856 sk->sk_rcvlowat = val ? : 1; 857 break; 858 859 case SO_RCVTIMEO: 860 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); 861 break; 862 863 case SO_SNDTIMEO: 864 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); 865 break; 866 867 case SO_ATTACH_FILTER: 868 ret = -EINVAL; 869 if (optlen == sizeof(struct sock_fprog)) { 870 struct sock_fprog fprog; 871 872 ret = -EFAULT; 873 if (copy_from_user(&fprog, optval, sizeof(fprog))) 874 break; 875 876 ret = sk_attach_filter(&fprog, sk); 877 } 878 break; 879 880 case SO_ATTACH_BPF: 881 ret = -EINVAL; 882 if (optlen == sizeof(u32)) { 883 u32 ufd; 884 885 ret = -EFAULT; 886 if (copy_from_user(&ufd, optval, sizeof(ufd))) 887 break; 888 889 ret = sk_attach_bpf(ufd, sk); 890 } 891 break; 892 893 case SO_ATTACH_REUSEPORT_CBPF: 894 ret = -EINVAL; 895 if (optlen == sizeof(struct sock_fprog)) { 896 struct sock_fprog fprog; 897 898 ret = -EFAULT; 899 if (copy_from_user(&fprog, optval, sizeof(fprog))) 900 break; 901 902 ret = sk_reuseport_attach_filter(&fprog, sk); 903 } 904 break; 905 906 case SO_ATTACH_REUSEPORT_EBPF: 907 ret = -EINVAL; 908 if (optlen == sizeof(u32)) { 909 u32 ufd; 910 911 ret = -EFAULT; 912 if (copy_from_user(&ufd, optval, sizeof(ufd))) 913 break; 914 915 ret = sk_reuseport_attach_bpf(ufd, sk); 916 } 917 break; 918 919 case SO_DETACH_FILTER: 920 ret = sk_detach_filter(sk); 921 break; 922 923 case SO_LOCK_FILTER: 924 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool) 925 ret = -EPERM; 926 else 927 sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool); 928 break; 929 930 case SO_PASSSEC: 931 if (valbool) 932 set_bit(SOCK_PASSSEC, &sock->flags); 933 else 934 clear_bit(SOCK_PASSSEC, &sock->flags); 935 break; 936 case SO_MARK: 937 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 938 ret = -EPERM; 939 else 940 sk->sk_mark = val; 941 break; 942 943 case SO_RXQ_OVFL: 944 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool); 945 break; 946 947 case SO_WIFI_STATUS: 948 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool); 949 break; 950 951 case SO_PEEK_OFF: 952 if (sock->ops->set_peek_off) 953 ret = sock->ops->set_peek_off(sk, val); 954 else 955 ret = -EOPNOTSUPP; 956 break; 957 958 case SO_NOFCS: 959 sock_valbool_flag(sk, SOCK_NOFCS, valbool); 960 break; 961 962 case SO_SELECT_ERR_QUEUE: 963 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool); 964 break; 965 966 #ifdef CONFIG_NET_RX_BUSY_POLL 967 case SO_BUSY_POLL: 968 /* allow unprivileged users to decrease the value */ 969 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) 970 ret = -EPERM; 971 else { 972 if (val < 0) 973 ret = -EINVAL; 974 else 975 sk->sk_ll_usec = val; 976 } 977 break; 978 #endif 979 980 case SO_MAX_PACING_RATE: 981 sk->sk_max_pacing_rate = val; 982 sk->sk_pacing_rate = min(sk->sk_pacing_rate, 983 sk->sk_max_pacing_rate); 984 break; 985 986 case SO_INCOMING_CPU: 987 sk->sk_incoming_cpu = val; 988 break; 989 990 default: 991 ret = -ENOPROTOOPT; 992 break; 993 } 994 release_sock(sk); 995 return ret; 996 } 997 EXPORT_SYMBOL(sock_setsockopt); 998 999 1000 static void cred_to_ucred(struct pid *pid, const struct cred *cred, 1001 struct 
ucred *ucred) 1002 { 1003 ucred->pid = pid_vnr(pid); 1004 ucred->uid = ucred->gid = -1; 1005 if (cred) { 1006 struct user_namespace *current_ns = current_user_ns(); 1007 1008 ucred->uid = from_kuid_munged(current_ns, cred->euid); 1009 ucred->gid = from_kgid_munged(current_ns, cred->egid); 1010 } 1011 } 1012 1013 int sock_getsockopt(struct socket *sock, int level, int optname, 1014 char __user *optval, int __user *optlen) 1015 { 1016 struct sock *sk = sock->sk; 1017 1018 union { 1019 int val; 1020 struct linger ling; 1021 struct timeval tm; 1022 } v; 1023 1024 int lv = sizeof(int); 1025 int len; 1026 1027 if (get_user(len, optlen)) 1028 return -EFAULT; 1029 if (len < 0) 1030 return -EINVAL; 1031 1032 memset(&v, 0, sizeof(v)); 1033 1034 switch (optname) { 1035 case SO_DEBUG: 1036 v.val = sock_flag(sk, SOCK_DBG); 1037 break; 1038 1039 case SO_DONTROUTE: 1040 v.val = sock_flag(sk, SOCK_LOCALROUTE); 1041 break; 1042 1043 case SO_BROADCAST: 1044 v.val = sock_flag(sk, SOCK_BROADCAST); 1045 break; 1046 1047 case SO_SNDBUF: 1048 v.val = sk->sk_sndbuf; 1049 break; 1050 1051 case SO_RCVBUF: 1052 v.val = sk->sk_rcvbuf; 1053 break; 1054 1055 case SO_REUSEADDR: 1056 v.val = sk->sk_reuse; 1057 break; 1058 1059 case SO_REUSEPORT: 1060 v.val = sk->sk_reuseport; 1061 break; 1062 1063 case SO_KEEPALIVE: 1064 v.val = sock_flag(sk, SOCK_KEEPOPEN); 1065 break; 1066 1067 case SO_TYPE: 1068 v.val = sk->sk_type; 1069 break; 1070 1071 case SO_PROTOCOL: 1072 v.val = sk->sk_protocol; 1073 break; 1074 1075 case SO_DOMAIN: 1076 v.val = sk->sk_family; 1077 break; 1078 1079 case SO_ERROR: 1080 v.val = -sock_error(sk); 1081 if (v.val == 0) 1082 v.val = xchg(&sk->sk_err_soft, 0); 1083 break; 1084 1085 case SO_OOBINLINE: 1086 v.val = sock_flag(sk, SOCK_URGINLINE); 1087 break; 1088 1089 case SO_NO_CHECK: 1090 v.val = sk->sk_no_check_tx; 1091 break; 1092 1093 case SO_PRIORITY: 1094 v.val = sk->sk_priority; 1095 break; 1096 1097 case SO_LINGER: 1098 lv = sizeof(v.ling); 1099 v.ling.l_onoff = sock_flag(sk, SOCK_LINGER); 1100 v.ling.l_linger = sk->sk_lingertime / HZ; 1101 break; 1102 1103 case SO_BSDCOMPAT: 1104 sock_warn_obsolete_bsdism("getsockopt"); 1105 break; 1106 1107 case SO_TIMESTAMP: 1108 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && 1109 !sock_flag(sk, SOCK_RCVTSTAMPNS); 1110 break; 1111 1112 case SO_TIMESTAMPNS: 1113 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS); 1114 break; 1115 1116 case SO_TIMESTAMPING: 1117 v.val = sk->sk_tsflags; 1118 break; 1119 1120 case SO_RCVTIMEO: 1121 lv = sizeof(struct timeval); 1122 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { 1123 v.tm.tv_sec = 0; 1124 v.tm.tv_usec = 0; 1125 } else { 1126 v.tm.tv_sec = sk->sk_rcvtimeo / HZ; 1127 v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; 1128 } 1129 break; 1130 1131 case SO_SNDTIMEO: 1132 lv = sizeof(struct timeval); 1133 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { 1134 v.tm.tv_sec = 0; 1135 v.tm.tv_usec = 0; 1136 } else { 1137 v.tm.tv_sec = sk->sk_sndtimeo / HZ; 1138 v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; 1139 } 1140 break; 1141 1142 case SO_RCVLOWAT: 1143 v.val = sk->sk_rcvlowat; 1144 break; 1145 1146 case SO_SNDLOWAT: 1147 v.val = 1; 1148 break; 1149 1150 case SO_PASSCRED: 1151 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags); 1152 break; 1153 1154 case SO_PEERCRED: 1155 { 1156 struct ucred peercred; 1157 if (len > sizeof(peercred)) 1158 len = sizeof(peercred); 1159 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); 1160 if (copy_to_user(optval, &peercred, len)) 1161 return -EFAULT; 1162 goto lenout; 1163 } 1164 1165 
case SO_PEERNAME: 1166 { 1167 char address[128]; 1168 1169 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) 1170 return -ENOTCONN; 1171 if (lv < len) 1172 return -EINVAL; 1173 if (copy_to_user(optval, address, len)) 1174 return -EFAULT; 1175 goto lenout; 1176 } 1177 1178 /* Dubious BSD thing... Probably nobody even uses it, but 1179 * the UNIX standard wants it for whatever reason... -DaveM 1180 */ 1181 case SO_ACCEPTCONN: 1182 v.val = sk->sk_state == TCP_LISTEN; 1183 break; 1184 1185 case SO_PASSSEC: 1186 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags); 1187 break; 1188 1189 case SO_PEERSEC: 1190 return security_socket_getpeersec_stream(sock, optval, optlen, len); 1191 1192 case SO_MARK: 1193 v.val = sk->sk_mark; 1194 break; 1195 1196 case SO_RXQ_OVFL: 1197 v.val = sock_flag(sk, SOCK_RXQ_OVFL); 1198 break; 1199 1200 case SO_WIFI_STATUS: 1201 v.val = sock_flag(sk, SOCK_WIFI_STATUS); 1202 break; 1203 1204 case SO_PEEK_OFF: 1205 if (!sock->ops->set_peek_off) 1206 return -EOPNOTSUPP; 1207 1208 v.val = sk->sk_peek_off; 1209 break; 1210 case SO_NOFCS: 1211 v.val = sock_flag(sk, SOCK_NOFCS); 1212 break; 1213 1214 case SO_BINDTODEVICE: 1215 return sock_getbindtodevice(sk, optval, optlen, len); 1216 1217 case SO_GET_FILTER: 1218 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); 1219 if (len < 0) 1220 return len; 1221 1222 goto lenout; 1223 1224 case SO_LOCK_FILTER: 1225 v.val = sock_flag(sk, SOCK_FILTER_LOCKED); 1226 break; 1227 1228 case SO_BPF_EXTENSIONS: 1229 v.val = bpf_tell_extensions(); 1230 break; 1231 1232 case SO_SELECT_ERR_QUEUE: 1233 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE); 1234 break; 1235 1236 #ifdef CONFIG_NET_RX_BUSY_POLL 1237 case SO_BUSY_POLL: 1238 v.val = sk->sk_ll_usec; 1239 break; 1240 #endif 1241 1242 case SO_MAX_PACING_RATE: 1243 v.val = sk->sk_max_pacing_rate; 1244 break; 1245 1246 case SO_INCOMING_CPU: 1247 v.val = sk->sk_incoming_cpu; 1248 break; 1249 1250 default: 1251 /* We implement the SO_SNDLOWAT etc to not be settable 1252 * (1003.1g 7). 1253 */ 1254 return -ENOPROTOOPT; 1255 } 1256 1257 if (len > lv) 1258 len = lv; 1259 if (copy_to_user(optval, &v, len)) 1260 return -EFAULT; 1261 lenout: 1262 if (put_user(len, optlen)) 1263 return -EFAULT; 1264 return 0; 1265 } 1266 1267 /* 1268 * Initialize an sk_lock. 1269 * 1270 * (We also register the sk_lock with the lock validator.) 1271 */ 1272 static inline void sock_lock_init(struct sock *sk) 1273 { 1274 sock_lock_init_class_and_name(sk, 1275 af_family_slock_key_strings[sk->sk_family], 1276 af_family_slock_keys + sk->sk_family, 1277 af_family_key_strings[sk->sk_family], 1278 af_family_keys + sk->sk_family); 1279 } 1280 1281 /* 1282 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, 1283 * even temporarly, because of RCU lookups. sk_node should also be left as is. 
1284 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end 1285 */ 1286 static void sock_copy(struct sock *nsk, const struct sock *osk) 1287 { 1288 #ifdef CONFIG_SECURITY_NETWORK 1289 void *sptr = nsk->sk_security; 1290 #endif 1291 memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin)); 1292 1293 memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end, 1294 osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end)); 1295 1296 #ifdef CONFIG_SECURITY_NETWORK 1297 nsk->sk_security = sptr; 1298 security_sk_clone(osk, nsk); 1299 #endif 1300 } 1301 1302 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size) 1303 { 1304 unsigned long nulls1, nulls2; 1305 1306 nulls1 = offsetof(struct sock, __sk_common.skc_node.next); 1307 nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next); 1308 if (nulls1 > nulls2) 1309 swap(nulls1, nulls2); 1310 1311 if (nulls1 != 0) 1312 memset((char *)sk, 0, nulls1); 1313 memset((char *)sk + nulls1 + sizeof(void *), 0, 1314 nulls2 - nulls1 - sizeof(void *)); 1315 memset((char *)sk + nulls2 + sizeof(void *), 0, 1316 size - nulls2 - sizeof(void *)); 1317 } 1318 EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls); 1319 1320 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, 1321 int family) 1322 { 1323 struct sock *sk; 1324 struct kmem_cache *slab; 1325 1326 slab = prot->slab; 1327 if (slab != NULL) { 1328 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO); 1329 if (!sk) 1330 return sk; 1331 if (priority & __GFP_ZERO) { 1332 if (prot->clear_sk) 1333 prot->clear_sk(sk, prot->obj_size); 1334 else 1335 sk_prot_clear_nulls(sk, prot->obj_size); 1336 } 1337 } else 1338 sk = kmalloc(prot->obj_size, priority); 1339 1340 if (sk != NULL) { 1341 kmemcheck_annotate_bitfield(sk, flags); 1342 1343 if (security_sk_alloc(sk, family, priority)) 1344 goto out_free; 1345 1346 if (!try_module_get(prot->owner)) 1347 goto out_free_sec; 1348 sk_tx_queue_clear(sk); 1349 cgroup_sk_alloc(&sk->sk_cgrp_data); 1350 } 1351 1352 return sk; 1353 1354 out_free_sec: 1355 security_sk_free(sk); 1356 out_free: 1357 if (slab != NULL) 1358 kmem_cache_free(slab, sk); 1359 else 1360 kfree(sk); 1361 return NULL; 1362 } 1363 1364 static void sk_prot_free(struct proto *prot, struct sock *sk) 1365 { 1366 struct kmem_cache *slab; 1367 struct module *owner; 1368 1369 owner = prot->owner; 1370 slab = prot->slab; 1371 1372 cgroup_sk_free(&sk->sk_cgrp_data); 1373 security_sk_free(sk); 1374 if (slab != NULL) 1375 kmem_cache_free(slab, sk); 1376 else 1377 kfree(sk); 1378 module_put(owner); 1379 } 1380 1381 /** 1382 * sk_alloc - All socket objects are allocated here 1383 * @net: the applicable net namespace 1384 * @family: protocol family 1385 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1386 * @prot: struct proto associated with this new sock instance 1387 * @kern: is this to be a kernel socket? 1388 */ 1389 struct sock *sk_alloc(struct net *net, int family, gfp_t priority, 1390 struct proto *prot, int kern) 1391 { 1392 struct sock *sk; 1393 1394 sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family); 1395 if (sk) { 1396 sk->sk_family = family; 1397 /* 1398 * See comment in struct sock definition to understand 1399 * why we need sk_prot_creator -acme 1400 */ 1401 sk->sk_prot = sk->sk_prot_creator = prot; 1402 sock_lock_init(sk); 1403 sk->sk_net_refcnt = kern ? 
0 : 1; 1404 if (likely(sk->sk_net_refcnt)) 1405 get_net(net); 1406 sock_net_set(sk, net); 1407 atomic_set(&sk->sk_wmem_alloc, 1); 1408 1409 sock_update_classid(&sk->sk_cgrp_data); 1410 sock_update_netprioidx(&sk->sk_cgrp_data); 1411 } 1412 1413 return sk; 1414 } 1415 EXPORT_SYMBOL(sk_alloc); 1416 1417 void sk_destruct(struct sock *sk) 1418 { 1419 struct sk_filter *filter; 1420 1421 if (sk->sk_destruct) 1422 sk->sk_destruct(sk); 1423 1424 filter = rcu_dereference_check(sk->sk_filter, 1425 atomic_read(&sk->sk_wmem_alloc) == 0); 1426 if (filter) { 1427 sk_filter_uncharge(sk, filter); 1428 RCU_INIT_POINTER(sk->sk_filter, NULL); 1429 } 1430 if (rcu_access_pointer(sk->sk_reuseport_cb)) 1431 reuseport_detach_sock(sk); 1432 1433 sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP); 1434 1435 if (atomic_read(&sk->sk_omem_alloc)) 1436 pr_debug("%s: optmem leakage (%d bytes) detected\n", 1437 __func__, atomic_read(&sk->sk_omem_alloc)); 1438 1439 if (sk->sk_peer_cred) 1440 put_cred(sk->sk_peer_cred); 1441 put_pid(sk->sk_peer_pid); 1442 if (likely(sk->sk_net_refcnt)) 1443 put_net(sock_net(sk)); 1444 sk_prot_free(sk->sk_prot_creator, sk); 1445 } 1446 1447 static void __sk_free(struct sock *sk) 1448 { 1449 if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) 1450 sock_diag_broadcast_destroy(sk); 1451 else 1452 sk_destruct(sk); 1453 } 1454 1455 void sk_free(struct sock *sk) 1456 { 1457 /* 1458 * We subtract one from sk_wmem_alloc and can know if 1459 * some packets are still in some tx queue. 1460 * If not null, sock_wfree() will call __sk_free(sk) later 1461 */ 1462 if (atomic_dec_and_test(&sk->sk_wmem_alloc)) 1463 __sk_free(sk); 1464 } 1465 EXPORT_SYMBOL(sk_free); 1466 1467 /** 1468 * sk_clone_lock - clone a socket, and lock its clone 1469 * @sk: the socket to clone 1470 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) 1471 * 1472 * Caller must unlock socket even in error path (bh_unlock_sock(newsk)) 1473 */ 1474 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) 1475 { 1476 struct sock *newsk; 1477 bool is_charged = true; 1478 1479 newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family); 1480 if (newsk != NULL) { 1481 struct sk_filter *filter; 1482 1483 sock_copy(newsk, sk); 1484 1485 /* SANITY */ 1486 if (likely(newsk->sk_net_refcnt)) 1487 get_net(sock_net(newsk)); 1488 sk_node_init(&newsk->sk_node); 1489 sock_lock_init(newsk); 1490 bh_lock_sock(newsk); 1491 newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; 1492 newsk->sk_backlog.len = 0; 1493 1494 atomic_set(&newsk->sk_rmem_alloc, 0); 1495 /* 1496 * sk_wmem_alloc set to one (see sk_free() and sock_wfree()) 1497 */ 1498 atomic_set(&newsk->sk_wmem_alloc, 1); 1499 atomic_set(&newsk->sk_omem_alloc, 0); 1500 skb_queue_head_init(&newsk->sk_receive_queue); 1501 skb_queue_head_init(&newsk->sk_write_queue); 1502 1503 rwlock_init(&newsk->sk_callback_lock); 1504 lockdep_set_class_and_name(&newsk->sk_callback_lock, 1505 af_callback_keys + newsk->sk_family, 1506 af_family_clock_key_strings[newsk->sk_family]); 1507 1508 newsk->sk_dst_cache = NULL; 1509 newsk->sk_wmem_queued = 0; 1510 newsk->sk_forward_alloc = 0; 1511 newsk->sk_send_head = NULL; 1512 newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; 1513 1514 sock_reset_flag(newsk, SOCK_DONE); 1515 skb_queue_head_init(&newsk->sk_error_queue); 1516 1517 filter = rcu_dereference_protected(newsk->sk_filter, 1); 1518 if (filter != NULL) 1519 /* though it's an empty new sock, the charging may fail 1520 * if sysctl_optmem_max was changed between creation 
of 1521 * original socket and cloning 1522 */ 1523 is_charged = sk_filter_charge(newsk, filter); 1524 1525 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) { 1526 /* It is still raw copy of parent, so invalidate 1527 * destructor and make plain sk_free() */ 1528 newsk->sk_destruct = NULL; 1529 bh_unlock_sock(newsk); 1530 sk_free(newsk); 1531 newsk = NULL; 1532 goto out; 1533 } 1534 1535 newsk->sk_err = 0; 1536 newsk->sk_priority = 0; 1537 newsk->sk_incoming_cpu = raw_smp_processor_id(); 1538 atomic64_set(&newsk->sk_cookie, 0); 1539 /* 1540 * Before updating sk_refcnt, we must commit prior changes to memory 1541 * (Documentation/RCU/rculist_nulls.txt for details) 1542 */ 1543 smp_wmb(); 1544 atomic_set(&newsk->sk_refcnt, 2); 1545 1546 /* 1547 * Increment the counter in the same struct proto as the master 1548 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that 1549 * is the same as sk->sk_prot->socks, as this field was copied 1550 * with memcpy). 1551 * 1552 * This _changes_ the previous behaviour, where 1553 * tcp_create_openreq_child always was incrementing the 1554 * equivalent to tcp_prot->socks (inet_sock_nr), so this have 1555 * to be taken into account in all callers. -acme 1556 */ 1557 sk_refcnt_debug_inc(newsk); 1558 sk_set_socket(newsk, NULL); 1559 newsk->sk_wq = NULL; 1560 1561 if (mem_cgroup_sockets_enabled && sk->sk_memcg) 1562 sock_update_memcg(newsk); 1563 1564 if (newsk->sk_prot->sockets_allocated) 1565 sk_sockets_allocated_inc(newsk); 1566 1567 if (sock_needs_netstamp(sk) && 1568 newsk->sk_flags & SK_FLAGS_TIMESTAMP) 1569 net_enable_timestamp(); 1570 } 1571 out: 1572 return newsk; 1573 } 1574 EXPORT_SYMBOL_GPL(sk_clone_lock); 1575 1576 void sk_setup_caps(struct sock *sk, struct dst_entry *dst) 1577 { 1578 u32 max_segs = 1; 1579 1580 sk_dst_set(sk, dst); 1581 sk->sk_route_caps = dst->dev->features; 1582 if (sk->sk_route_caps & NETIF_F_GSO) 1583 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE; 1584 sk->sk_route_caps &= ~sk->sk_route_nocaps; 1585 if (sk_can_gso(sk)) { 1586 if (dst->header_len) { 1587 sk->sk_route_caps &= ~NETIF_F_GSO_MASK; 1588 } else { 1589 sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; 1590 sk->sk_gso_max_size = dst->dev->gso_max_size; 1591 max_segs = max_t(u32, dst->dev->gso_max_segs, 1); 1592 } 1593 } 1594 sk->sk_gso_max_segs = max_segs; 1595 } 1596 EXPORT_SYMBOL_GPL(sk_setup_caps); 1597 1598 /* 1599 * Simple resource managers for sockets. 1600 */ 1601 1602 1603 /* 1604 * Write buffer destructor automatically called from kfree_skb. 
 */
void sock_wfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
		/*
		 * Keep a reference on sk_wmem_alloc, this will be released
		 * after sk_write_space() call
		 */
		atomic_sub(len - 1, &sk->sk_wmem_alloc);
		sk->sk_write_space(sk);
		len = 1;
	}
	/*
	 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
	 * could not do because of in-flight packets
	 */
	if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
		__sk_free(sk);
}
EXPORT_SYMBOL(sock_wfree);

void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
{
	skb_orphan(skb);
	skb->sk = sk;
#ifdef CONFIG_INET
	if (unlikely(!sk_fullsock(sk))) {
		skb->destructor = sock_edemux;
		sock_hold(sk);
		return;
	}
#endif
	skb->destructor = sock_wfree;
	skb_set_hash_from_sk(skb, sk);
	/*
	 * We used to take a refcount on sk, but following operation
	 * is enough to guarantee sk_free() won't free this sock until
	 * all in-flight packets are completed
	 */
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);
}
EXPORT_SYMBOL(skb_set_owner_w);

void skb_orphan_partial(struct sk_buff *skb)
{
	/* TCP stack sets skb->ooo_okay based on sk_wmem_alloc,
	 * so we do not completely orphan skb, but transfer all
	 * accounted bytes but one, to avoid unexpected reorders.
	 */
	if (skb->destructor == sock_wfree
#ifdef CONFIG_INET
	    || skb->destructor == tcp_wfree
#endif
		) {
		atomic_sub(skb->truesize - 1, &skb->sk->sk_wmem_alloc);
		skb->truesize = 1;
	} else {
		skb_orphan(skb);
	}
}
EXPORT_SYMBOL(skb_orphan_partial);

/*
 * Read buffer destructor automatically called from kfree_skb.
 */
void sock_rfree(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	unsigned int len = skb->truesize;

	atomic_sub(len, &sk->sk_rmem_alloc);
	sk_mem_uncharge(sk, len);
}
EXPORT_SYMBOL(sock_rfree);

/*
 * Buffer destructor for skbs that are not used directly in read or write
 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
 */
void sock_efree(struct sk_buff *skb)
{
	sock_put(skb->sk);
}
EXPORT_SYMBOL(sock_efree);

kuid_t sock_i_uid(struct sock *sk)
{
	kuid_t uid;

	read_lock_bh(&sk->sk_callback_lock);
	uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
	read_unlock_bh(&sk->sk_callback_lock);
	return uid;
}
EXPORT_SYMBOL(sock_i_uid);

unsigned long sock_i_ino(struct sock *sk)
{
	unsigned long ino;

	read_lock_bh(&sk->sk_callback_lock);
	ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
	read_unlock_bh(&sk->sk_callback_lock);
	return ino;
}
EXPORT_SYMBOL(sock_i_ino);

/*
 * Allocate a skb from the socket's send buffer.
 */
struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
			     gfp_t priority)
{
	if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
		struct sk_buff *skb = alloc_skb(size, priority);
		if (skb) {
			skb_set_owner_w(skb, sk);
			return skb;
		}
	}
	return NULL;
}
EXPORT_SYMBOL(sock_wmalloc);

/*
 * Allocate a memory block from the socket's option memory buffer.
1734 */ 1735 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority) 1736 { 1737 if ((unsigned int)size <= sysctl_optmem_max && 1738 atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { 1739 void *mem; 1740 /* First do the add, to avoid the race if kmalloc 1741 * might sleep. 1742 */ 1743 atomic_add(size, &sk->sk_omem_alloc); 1744 mem = kmalloc(size, priority); 1745 if (mem) 1746 return mem; 1747 atomic_sub(size, &sk->sk_omem_alloc); 1748 } 1749 return NULL; 1750 } 1751 EXPORT_SYMBOL(sock_kmalloc); 1752 1753 /* Free an option memory block. Note, we actually want the inline 1754 * here as this allows gcc to detect the nullify and fold away the 1755 * condition entirely. 1756 */ 1757 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size, 1758 const bool nullify) 1759 { 1760 if (WARN_ON_ONCE(!mem)) 1761 return; 1762 if (nullify) 1763 kzfree(mem); 1764 else 1765 kfree(mem); 1766 atomic_sub(size, &sk->sk_omem_alloc); 1767 } 1768 1769 void sock_kfree_s(struct sock *sk, void *mem, int size) 1770 { 1771 __sock_kfree_s(sk, mem, size, false); 1772 } 1773 EXPORT_SYMBOL(sock_kfree_s); 1774 1775 void sock_kzfree_s(struct sock *sk, void *mem, int size) 1776 { 1777 __sock_kfree_s(sk, mem, size, true); 1778 } 1779 EXPORT_SYMBOL(sock_kzfree_s); 1780 1781 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock. 1782 I think, these locks should be removed for datagram sockets. 1783 */ 1784 static long sock_wait_for_wmem(struct sock *sk, long timeo) 1785 { 1786 DEFINE_WAIT(wait); 1787 1788 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk); 1789 for (;;) { 1790 if (!timeo) 1791 break; 1792 if (signal_pending(current)) 1793 break; 1794 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1795 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 1796 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) 1797 break; 1798 if (sk->sk_shutdown & SEND_SHUTDOWN) 1799 break; 1800 if (sk->sk_err) 1801 break; 1802 timeo = schedule_timeout(timeo); 1803 } 1804 finish_wait(sk_sleep(sk), &wait); 1805 return timeo; 1806 } 1807 1808 1809 /* 1810 * Generic send/receive buffer handlers 1811 */ 1812 1813 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, 1814 unsigned long data_len, int noblock, 1815 int *errcode, int max_page_order) 1816 { 1817 struct sk_buff *skb; 1818 long timeo; 1819 int err; 1820 1821 timeo = sock_sndtimeo(sk, noblock); 1822 for (;;) { 1823 err = sock_error(sk); 1824 if (err != 0) 1825 goto failure; 1826 1827 err = -EPIPE; 1828 if (sk->sk_shutdown & SEND_SHUTDOWN) 1829 goto failure; 1830 1831 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf) 1832 break; 1833 1834 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); 1835 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); 1836 err = -EAGAIN; 1837 if (!timeo) 1838 goto failure; 1839 if (signal_pending(current)) 1840 goto interrupted; 1841 timeo = sock_wait_for_wmem(sk, timeo); 1842 } 1843 skb = alloc_skb_with_frags(header_len, data_len, max_page_order, 1844 errcode, sk->sk_allocation); 1845 if (skb) 1846 skb_set_owner_w(skb, sk); 1847 return skb; 1848 1849 interrupted: 1850 err = sock_intr_errno(timeo); 1851 failure: 1852 *errcode = err; 1853 return NULL; 1854 } 1855 EXPORT_SYMBOL(sock_alloc_send_pskb); 1856 1857 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 1858 int noblock, int *errcode) 1859 { 1860 return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0); 1861 } 1862 EXPORT_SYMBOL(sock_alloc_send_skb); 1863 1864 int sock_cmsg_send(struct sock *sk, struct msghdr *msg, 1865 struct sockcm_cookie 
*sockc) 1866 { 1867 struct cmsghdr *cmsg; 1868 1869 for_each_cmsghdr(cmsg, msg) { 1870 if (!CMSG_OK(msg, cmsg)) 1871 return -EINVAL; 1872 if (cmsg->cmsg_level != SOL_SOCKET) 1873 continue; 1874 switch (cmsg->cmsg_type) { 1875 case SO_MARK: 1876 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) 1877 return -EPERM; 1878 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32))) 1879 return -EINVAL; 1880 sockc->mark = *(u32 *)CMSG_DATA(cmsg); 1881 break; 1882 default: 1883 return -EINVAL; 1884 } 1885 } 1886 return 0; 1887 } 1888 EXPORT_SYMBOL(sock_cmsg_send); 1889 1890 /* On 32bit arches, an skb frag is limited to 2^15 */ 1891 #define SKB_FRAG_PAGE_ORDER get_order(32768) 1892 1893 /** 1894 * skb_page_frag_refill - check that a page_frag contains enough room 1895 * @sz: minimum size of the fragment we want to get 1896 * @pfrag: pointer to page_frag 1897 * @gfp: priority for memory allocation 1898 * 1899 * Note: While this allocator tries to use high order pages, there is 1900 * no guarantee that allocations succeed. Therefore, @sz MUST be 1901 * less or equal than PAGE_SIZE. 1902 */ 1903 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp) 1904 { 1905 if (pfrag->page) { 1906 if (atomic_read(&pfrag->page->_count) == 1) { 1907 pfrag->offset = 0; 1908 return true; 1909 } 1910 if (pfrag->offset + sz <= pfrag->size) 1911 return true; 1912 put_page(pfrag->page); 1913 } 1914 1915 pfrag->offset = 0; 1916 if (SKB_FRAG_PAGE_ORDER) { 1917 /* Avoid direct reclaim but allow kswapd to wake */ 1918 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) | 1919 __GFP_COMP | __GFP_NOWARN | 1920 __GFP_NORETRY, 1921 SKB_FRAG_PAGE_ORDER); 1922 if (likely(pfrag->page)) { 1923 pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER; 1924 return true; 1925 } 1926 } 1927 pfrag->page = alloc_page(gfp); 1928 if (likely(pfrag->page)) { 1929 pfrag->size = PAGE_SIZE; 1930 return true; 1931 } 1932 return false; 1933 } 1934 EXPORT_SYMBOL(skb_page_frag_refill); 1935 1936 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag) 1937 { 1938 if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation))) 1939 return true; 1940 1941 sk_enter_memory_pressure(sk); 1942 sk_stream_moderate_sndbuf(sk); 1943 return false; 1944 } 1945 EXPORT_SYMBOL(sk_page_frag_refill); 1946 1947 static void __lock_sock(struct sock *sk) 1948 __releases(&sk->sk_lock.slock) 1949 __acquires(&sk->sk_lock.slock) 1950 { 1951 DEFINE_WAIT(wait); 1952 1953 for (;;) { 1954 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, 1955 TASK_UNINTERRUPTIBLE); 1956 spin_unlock_bh(&sk->sk_lock.slock); 1957 schedule(); 1958 spin_lock_bh(&sk->sk_lock.slock); 1959 if (!sock_owned_by_user(sk)) 1960 break; 1961 } 1962 finish_wait(&sk->sk_lock.wq, &wait); 1963 } 1964 1965 static void __release_sock(struct sock *sk) 1966 __releases(&sk->sk_lock.slock) 1967 __acquires(&sk->sk_lock.slock) 1968 { 1969 struct sk_buff *skb = sk->sk_backlog.head; 1970 1971 do { 1972 sk->sk_backlog.head = sk->sk_backlog.tail = NULL; 1973 bh_unlock_sock(sk); 1974 1975 do { 1976 struct sk_buff *next = skb->next; 1977 1978 prefetch(next); 1979 WARN_ON_ONCE(skb_dst_is_noref(skb)); 1980 skb->next = NULL; 1981 sk_backlog_rcv(sk, skb); 1982 1983 /* 1984 * We are in process context here with softirqs 1985 * disabled, use cond_resched_softirq() to preempt. 
1986 * This is safe to do because we've taken the backlog 1987 * queue private: 1988 */ 1989 cond_resched_softirq(); 1990 1991 skb = next; 1992 } while (skb != NULL); 1993 1994 bh_lock_sock(sk); 1995 } while ((skb = sk->sk_backlog.head) != NULL); 1996 1997 /* 1998 * Doing the zeroing here guarantee we can not loop forever 1999 * while a wild producer attempts to flood us. 2000 */ 2001 sk->sk_backlog.len = 0; 2002 } 2003 2004 /** 2005 * sk_wait_data - wait for data to arrive at sk_receive_queue 2006 * @sk: sock to wait on 2007 * @timeo: for how long 2008 * @skb: last skb seen on sk_receive_queue 2009 * 2010 * Now socket state including sk->sk_err is changed only under lock, 2011 * hence we may omit checks after joining wait queue. 2012 * We check receive queue before schedule() only as optimization; 2013 * it is very likely that release_sock() added new data. 2014 */ 2015 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb) 2016 { 2017 int rc; 2018 DEFINE_WAIT(wait); 2019 2020 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); 2021 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2022 rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb); 2023 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); 2024 finish_wait(sk_sleep(sk), &wait); 2025 return rc; 2026 } 2027 EXPORT_SYMBOL(sk_wait_data); 2028 2029 /** 2030 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated 2031 * @sk: socket 2032 * @size: memory size to allocate 2033 * @kind: allocation type 2034 * 2035 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means 2036 * rmem allocation. This function assumes that protocols which have 2037 * memory_pressure use sk_wmem_queued as write buffer accounting. 2038 */ 2039 int __sk_mem_schedule(struct sock *sk, int size, int kind) 2040 { 2041 struct proto *prot = sk->sk_prot; 2042 int amt = sk_mem_pages(size); 2043 long allocated; 2044 2045 sk->sk_forward_alloc += amt * SK_MEM_QUANTUM; 2046 2047 allocated = sk_memory_allocated_add(sk, amt); 2048 2049 if (mem_cgroup_sockets_enabled && sk->sk_memcg && 2050 !mem_cgroup_charge_skmem(sk->sk_memcg, amt)) 2051 goto suppress_allocation; 2052 2053 /* Under limit. */ 2054 if (allocated <= sk_prot_mem_limits(sk, 0)) { 2055 sk_leave_memory_pressure(sk); 2056 return 1; 2057 } 2058 2059 /* Under pressure. */ 2060 if (allocated > sk_prot_mem_limits(sk, 1)) 2061 sk_enter_memory_pressure(sk); 2062 2063 /* Over hard limit. */ 2064 if (allocated > sk_prot_mem_limits(sk, 2)) 2065 goto suppress_allocation; 2066 2067 /* guarantee minimum buffer size under pressure */ 2068 if (kind == SK_MEM_RECV) { 2069 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0]) 2070 return 1; 2071 2072 } else { /* SK_MEM_SEND */ 2073 if (sk->sk_type == SOCK_STREAM) { 2074 if (sk->sk_wmem_queued < prot->sysctl_wmem[0]) 2075 return 1; 2076 } else if (atomic_read(&sk->sk_wmem_alloc) < 2077 prot->sysctl_wmem[0]) 2078 return 1; 2079 } 2080 2081 if (sk_has_memory_pressure(sk)) { 2082 int alloc; 2083 2084 if (!sk_under_memory_pressure(sk)) 2085 return 1; 2086 alloc = sk_sockets_allocated_read_positive(sk); 2087 if (sk_prot_mem_limits(sk, 2) > alloc * 2088 sk_mem_pages(sk->sk_wmem_queued + 2089 atomic_read(&sk->sk_rmem_alloc) + 2090 sk->sk_forward_alloc)) 2091 return 1; 2092 } 2093 2094 suppress_allocation: 2095 2096 if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) { 2097 sk_stream_moderate_sndbuf(sk); 2098 2099 /* Fail only if socket is _under_ its sndbuf. 2100 * In this case we cannot block, so that we have to fail. 
/**
 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
 * @sk: socket
 * @size: memory size to allocate
 * @kind: allocation type
 *
 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
 * rmem allocation. This function assumes that protocols which have
 * memory_pressure use sk_wmem_queued as write buffer accounting.
 */
int __sk_mem_schedule(struct sock *sk, int size, int kind)
{
	struct proto *prot = sk->sk_prot;
	int amt = sk_mem_pages(size);
	long allocated;

	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;

	allocated = sk_memory_allocated_add(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
	    !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
		goto suppress_allocation;

	/* Under limit. */
	if (allocated <= sk_prot_mem_limits(sk, 0)) {
		sk_leave_memory_pressure(sk);
		return 1;
	}

	/* Under pressure. */
	if (allocated > sk_prot_mem_limits(sk, 1))
		sk_enter_memory_pressure(sk);

	/* Over hard limit. */
	if (allocated > sk_prot_mem_limits(sk, 2))
		goto suppress_allocation;

	/* Guarantee minimum buffer size under pressure. */
	if (kind == SK_MEM_RECV) {
		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
			return 1;

	} else { /* SK_MEM_SEND */
		if (sk->sk_type == SOCK_STREAM) {
			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
				return 1;
		} else if (atomic_read(&sk->sk_wmem_alloc) <
			   prot->sysctl_wmem[0])
			return 1;
	}

	if (sk_has_memory_pressure(sk)) {
		int alloc;

		if (!sk_under_memory_pressure(sk))
			return 1;
		alloc = sk_sockets_allocated_read_positive(sk);
		if (sk_prot_mem_limits(sk, 2) > alloc *
		    sk_mem_pages(sk->sk_wmem_queued +
				 atomic_read(&sk->sk_rmem_alloc) +
				 sk->sk_forward_alloc))
			return 1;
	}

suppress_allocation:

	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
		sk_stream_moderate_sndbuf(sk);

		/* Fail only if socket is _under_ its sndbuf.
		 * In this case we cannot block, so we have to fail.
		 */
		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
			return 1;
	}

	trace_sock_exceed_buf_limit(sk, prot, allocated);

	/* Alas. Undo changes. */
	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;

	sk_memory_allocated_sub(sk, amt);

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);

	return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);

/**
 * __sk_mem_reclaim - reclaim memory_allocated
 * @sk: socket
 * @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
 */
void __sk_mem_reclaim(struct sock *sk, int amount)
{
	amount >>= SK_MEM_QUANTUM_SHIFT;
	sk_memory_allocated_sub(sk, amount);
	sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;

	if (mem_cgroup_sockets_enabled && sk->sk_memcg)
		mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);

	if (sk_under_memory_pressure(sk) &&
	    (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
		sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
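/*
 * Illustrative sketch only: protocols normally reach __sk_mem_schedule() and
 * __sk_mem_reclaim() through the sk_rmem_schedule()/sk_wmem_schedule() and
 * sk_mem_charge()/sk_mem_uncharge() helpers in <net/sock.h>.  A receive path
 * might charge memory roughly like this (my_queue_rcv_skb() is hypothetical):
 *
 *	static int my_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 *	{
 *		if (!sk_rmem_schedule(sk, skb, skb->truesize))
 *			return -ENOBUFS;	// no forward allocation available
 *
 *		// charges sk_rmem_alloc/forward_alloc, destructor is sock_rfree()
 *		skb_set_owner_r(skb, sk);
 *		skb_queue_tail(&sk->sk_receive_queue, skb);
 *		sk->sk_data_ready(sk);
 *		return 0;
 *	}
 *
 * The matching uncharge happens from the skb destructor, and sk_mem_reclaim()
 * eventually returns whole SK_MEM_QUANTUM units to memory_allocated via
 * __sk_mem_reclaim().
 */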
/*
 * Set of default routines for initialising struct proto_ops when
 * the protocol does not support a particular function. In certain
 * cases where it makes no sense for a protocol to have a "do nothing"
 * function, some default processing is provided.
 */

int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_bind);

int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
		    int len, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_connect);

int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_socketpair);

int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_accept);

int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
		    int *len, int peer)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getname);

unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
	return 0;
}
EXPORT_SYMBOL(sock_no_poll);

int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_ioctl);

int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_listen);

int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_shutdown);

int sock_no_setsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, unsigned int optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_setsockopt);

int sock_no_getsockopt(struct socket *sock, int level, int optname,
		       char __user *optval, int __user *optlen)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_getsockopt);

int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_sendmsg);

int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
		    int flags)
{
	return -EOPNOTSUPP;
}
EXPORT_SYMBOL(sock_no_recvmsg);

int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	/* Mirror missing mmap method error code */
	return -ENODEV;
}
EXPORT_SYMBOL(sock_no_mmap);

ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
	ssize_t res;
	struct msghdr msg = {.msg_flags = flags};
	struct kvec iov;
	char *kaddr = kmap(page);

	iov.iov_base = kaddr + offset;
	iov.iov_len = size;
	res = kernel_sendmsg(sock, &msg, &iov, 1, size);
	kunmap(page);
	return res;
}
EXPORT_SYMBOL(sock_no_sendpage);
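/*
 * Illustrative sketch only: a protocol family that does not implement the
 * full BSD socket API typically plugs these stubs straight into its
 * struct proto_ops.  The my_dgram_ops table and the my_* handlers referenced
 * below are hypothetical, and PF_MAX stands in for a real family constant.
 *
 *	static const struct proto_ops my_dgram_ops = {
 *		.family		= PF_MAX,
 *		.owner		= THIS_MODULE,
 *		.release	= my_release,
 *		.bind		= my_bind,
 *		.connect	= sock_no_connect,
 *		.socketpair	= sock_no_socketpair,
 *		.accept		= sock_no_accept,
 *		.getname	= my_getname,
 *		.poll		= datagram_poll,
 *		.ioctl		= sock_no_ioctl,
 *		.listen		= sock_no_listen,
 *		.shutdown	= sock_no_shutdown,
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.sendmsg	= my_sendmsg,
 *		.recvmsg	= sock_common_recvmsg,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 */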
/*
 * Default Socket Callbacks
 */

static void sock_def_wakeup(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void sock_def_error_report(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_poll(&wq->wait, POLLERR);
	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
	rcu_read_unlock();
}

static void sock_def_readable(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
						POLLRDNORM | POLLRDBAND);
	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
	rcu_read_unlock();
}

static void sock_def_write_space(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();

	/* Do not wake up a writer until he can make "significant"
	 * progress.  --DaveM
	 */
	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
		wq = rcu_dereference(sk->sk_wq);
		if (skwq_has_sleeper(wq))
			wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
						POLLWRNORM | POLLWRBAND);

		/* Should agree with poll, otherwise some programs break */
		if (sock_writeable(sk))
			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
	}

	rcu_read_unlock();
}

static void sock_def_destruct(struct sock *sk)
{
}

void sk_send_sigurg(struct sock *sk)
{
	if (sk->sk_socket && sk->sk_socket->file)
		if (send_sigurg(&sk->sk_socket->file->f_owner))
			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
EXPORT_SYMBOL(sk_send_sigurg);

void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (!mod_timer(timer, expires))
		sock_hold(sk);
}
EXPORT_SYMBOL(sk_reset_timer);

void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (del_timer(timer))
		__sock_put(sk);
}
EXPORT_SYMBOL(sk_stop_timer);

void sock_init_data(struct socket *sock, struct sock *sk)
{
	skb_queue_head_init(&sk->sk_receive_queue);
	skb_queue_head_init(&sk->sk_write_queue);
	skb_queue_head_init(&sk->sk_error_queue);

	sk->sk_send_head = NULL;

	init_timer(&sk->sk_timer);

	sk->sk_allocation = GFP_KERNEL;
	sk->sk_rcvbuf = sysctl_rmem_default;
	sk->sk_sndbuf = sysctl_wmem_default;
	sk->sk_state = TCP_CLOSE;
	sk_set_socket(sk, sock);

	sock_set_flag(sk, SOCK_ZAPPED);

	if (sock) {
		sk->sk_type = sock->type;
		sk->sk_wq = sock->wq;
		sock->sk = sk;
	} else
		sk->sk_wq = NULL;

	rwlock_init(&sk->sk_callback_lock);
	lockdep_set_class_and_name(&sk->sk_callback_lock,
				   af_callback_keys + sk->sk_family,
				   af_family_clock_key_strings[sk->sk_family]);

	sk->sk_state_change = sock_def_wakeup;
	sk->sk_data_ready = sock_def_readable;
	sk->sk_write_space = sock_def_write_space;
	sk->sk_error_report = sock_def_error_report;
	sk->sk_destruct = sock_def_destruct;

	sk->sk_frag.page = NULL;
	sk->sk_frag.offset = 0;
	sk->sk_peek_off = -1;

	sk->sk_peer_pid = NULL;
	sk->sk_peer_cred = NULL;
	sk->sk_write_pending = 0;
	sk->sk_rcvlowat = 1;
	sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
	sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;

	sk->sk_stamp = ktime_set(-1L, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
	sk->sk_napi_id = 0;
	sk->sk_ll_usec = sysctl_net_busy_read;
#endif

	sk->sk_max_pacing_rate = ~0U;
	sk->sk_pacing_rate = ~0U;
	sk->sk_incoming_cpu = -1;
	/*
	 * Before updating sk_refcnt, we must commit prior changes to memory
	 * (Documentation/RCU/rculist_nulls.txt for details)
	 */
	smp_wmb();
	atomic_set(&sk->sk_refcnt, 1);
	atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data);
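/*
 * Illustrative sketch only: address families normally call sock_init_data()
 * from their create() handler, right after sk_alloc() and before filling in
 * any protocol-private state.  my_proto, my_ops and my_create() below are
 * hypothetical placeholders, and PF_MAX stands in for a real family constant.
 *
 *	static int my_create(struct net *net, struct socket *sock,
 *			     int protocol, int kern)
 *	{
 *		struct sock *sk;
 *
 *		sk = sk_alloc(net, PF_MAX, GFP_KERNEL, &my_proto, kern);
 *		if (!sk)
 *			return -ENOMEM;
 *
 *		sock->ops = &my_ops;
 *		sock_init_data(sock, sk);	// installs the default callbacks above
 *		sock->state = SS_UNCONNECTED;
 *		return 0;
 *	}
 */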
void lock_sock_nested(struct sock *sk, int subclass)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_lock.owned)
		__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
	local_bh_enable();
}
EXPORT_SYMBOL(lock_sock_nested);

void release_sock(struct sock *sk)
{
	/*
	 * The sk_lock has mutex_unlock() semantics:
	 */
	mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);

	spin_lock_bh(&sk->sk_lock.slock);
	if (sk->sk_backlog.tail)
		__release_sock(sk);

	/* Warning: release_cb() might need to release sk ownership,
	 * i.e. call sock_release_ownership(sk) before us.
	 */
	if (sk->sk_prot->release_cb)
		sk->sk_prot->release_cb(sk);

	sock_release_ownership(sk);
	if (waitqueue_active(&sk->sk_lock.wq))
		wake_up(&sk->sk_lock.wq);
	spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);

/**
 * lock_sock_fast - fast version of lock_sock
 * @sk: socket
 *
 * This version should be used for very small sections, where the process
 * won't block.
 *
 * Returns false if the fast path is taken:
 *   sk_lock.slock locked, owned = 0, BH disabled
 * Returns true if the slow path is taken:
 *   sk_lock.slock unlocked, owned = 1, BH enabled
 */
bool lock_sock_fast(struct sock *sk)
{
	might_sleep();
	spin_lock_bh(&sk->sk_lock.slock);

	if (!sk->sk_lock.owned)
		/*
		 * Note: we return with BH disabled on this fast path.
		 */
		return false;

	__lock_sock(sk);
	sk->sk_lock.owned = 1;
	spin_unlock(&sk->sk_lock.slock);
	/*
	 * The sk_lock has mutex_lock() semantics here:
	 */
	mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
	local_bh_enable();
	return true;
}
EXPORT_SYMBOL(lock_sock_fast);
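/*
 * Illustrative sketch only: callers pair lock_sock_fast() with
 * unlock_sock_fast(), remembering which path was taken, so that short
 * critical sections (e.g. draining a receive queue) avoid the full socket
 * lock when nobody owns the socket.  my_flush_queue() is hypothetical.
 *
 *	static void my_flush_queue(struct sock *sk)
 *	{
 *		bool slow = lock_sock_fast(sk);
 *
 *		skb_queue_purge(&sk->sk_receive_queue);
 *		sk_mem_reclaim(sk);
 *
 *		unlock_sock_fast(sk, slow);
 *	}
 */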
int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
{
	struct timeval tv;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	tv = ktime_to_timeval(sk->sk_stamp);
	if (tv.tv_sec == -1)
		return -ENOENT;
	if (tv.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		tv = ktime_to_timeval(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestamp);

int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
{
	struct timespec ts;

	if (!sock_flag(sk, SOCK_TIMESTAMP))
		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
	ts = ktime_to_timespec(sk->sk_stamp);
	if (ts.tv_sec == -1)
		return -ENOENT;
	if (ts.tv_sec == 0) {
		sk->sk_stamp = ktime_get_real();
		ts = ktime_to_timespec(sk->sk_stamp);
	}
	return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
}
EXPORT_SYMBOL(sock_get_timestampns);

void sock_enable_timestamp(struct sock *sk, int flag)
{
	if (!sock_flag(sk, flag)) {
		unsigned long previous_flags = sk->sk_flags;

		sock_set_flag(sk, flag);
		/*
		 * We just set one of the two flags which require net
		 * time stamping, but time stamping might already have been
		 * on because of the other one.
		 */
		if (sock_needs_netstamp(sk) &&
		    !(previous_flags & SK_FLAGS_TIMESTAMP))
			net_enable_timestamp();
	}
}

int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
		       int level, int type)
{
	struct sock_exterr_skb *serr;
	struct sk_buff *skb;
	int copied, err;

	err = -EAGAIN;
	skb = sock_dequeue_err_skb(sk);
	if (skb == NULL)
		goto out;

	copied = skb->len;
	if (copied > len) {
		msg->msg_flags |= MSG_TRUNC;
		copied = len;
	}
	err = skb_copy_datagram_msg(skb, 0, msg, copied);
	if (err)
		goto out_free_skb;

	sock_recv_timestamp(msg, sk, skb);

	serr = SKB_EXT_ERR(skb);
	put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);

	msg->msg_flags |= MSG_ERRQUEUE;
	err = copied;

out_free_skb:
	kfree_skb(skb);
out:
	return err;
}
EXPORT_SYMBOL(sock_recv_errqueue);
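/*
 * Illustrative sketch only: a protocol's recvmsg() typically dispatches
 * MSG_ERRQUEUE requests to sock_recv_errqueue() before touching the normal
 * receive queue.  MY_PROTO_LEVEL, MY_RECVERR and my_recvmsg() are
 * hypothetical placeholders for the protocol's own cmsg level and type.
 *
 *	static int my_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
 *			      int noblock, int flags, int *addr_len)
 *	{
 *		if (flags & MSG_ERRQUEUE)
 *			return sock_recv_errqueue(sk, msg, len,
 *						  MY_PROTO_LEVEL, MY_RECVERR);
 *
 *		// ... normal receive path ...
 *	}
 */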
2614 */ 2615 int sock_common_setsockopt(struct socket *sock, int level, int optname, 2616 char __user *optval, unsigned int optlen) 2617 { 2618 struct sock *sk = sock->sk; 2619 2620 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2621 } 2622 EXPORT_SYMBOL(sock_common_setsockopt); 2623 2624 #ifdef CONFIG_COMPAT 2625 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname, 2626 char __user *optval, unsigned int optlen) 2627 { 2628 struct sock *sk = sock->sk; 2629 2630 if (sk->sk_prot->compat_setsockopt != NULL) 2631 return sk->sk_prot->compat_setsockopt(sk, level, optname, 2632 optval, optlen); 2633 return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); 2634 } 2635 EXPORT_SYMBOL(compat_sock_common_setsockopt); 2636 #endif 2637 2638 void sk_common_release(struct sock *sk) 2639 { 2640 if (sk->sk_prot->destroy) 2641 sk->sk_prot->destroy(sk); 2642 2643 /* 2644 * Observation: when sock_common_release is called, processes have 2645 * no access to socket. But net still has. 2646 * Step one, detach it from networking: 2647 * 2648 * A. Remove from hash tables. 2649 */ 2650 2651 sk->sk_prot->unhash(sk); 2652 2653 /* 2654 * In this point socket cannot receive new packets, but it is possible 2655 * that some packets are in flight because some CPU runs receiver and 2656 * did hash table lookup before we unhashed socket. They will achieve 2657 * receive queue and will be purged by socket destructor. 2658 * 2659 * Also we still have packets pending on receive queue and probably, 2660 * our own packets waiting in device queues. sock_destroy will drain 2661 * receive queue, but transmitted packets will delay socket destruction 2662 * until the last reference will be released. 2663 */ 2664 2665 sock_orphan(sk); 2666 2667 xfrm_sk_free_policy(sk); 2668 2669 sk_refcnt_debug_release(sk); 2670 2671 if (sk->sk_frag.page) { 2672 put_page(sk->sk_frag.page); 2673 sk->sk_frag.page = NULL; 2674 } 2675 2676 sock_put(sk); 2677 } 2678 EXPORT_SYMBOL(sk_common_release); 2679 2680 #ifdef CONFIG_PROC_FS 2681 #define PROTO_INUSE_NR 64 /* should be enough for the first time */ 2682 struct prot_inuse { 2683 int val[PROTO_INUSE_NR]; 2684 }; 2685 2686 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); 2687 2688 #ifdef CONFIG_NET_NS 2689 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) 2690 { 2691 __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); 2692 } 2693 EXPORT_SYMBOL_GPL(sock_prot_inuse_add); 2694 2695 int sock_prot_inuse_get(struct net *net, struct proto *prot) 2696 { 2697 int cpu, idx = prot->inuse_idx; 2698 int res = 0; 2699 2700 for_each_possible_cpu(cpu) 2701 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; 2702 2703 return res >= 0 ? res : 0; 2704 } 2705 EXPORT_SYMBOL_GPL(sock_prot_inuse_get); 2706 2707 static int __net_init sock_inuse_init_net(struct net *net) 2708 { 2709 net->core.inuse = alloc_percpu(struct prot_inuse); 2710 return net->core.inuse ? 
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR	64	/* should be enough for the first time */
struct prot_inuse {
	int val[PROTO_INUSE_NR];
};

static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);

#ifdef CONFIG_NET_NS
void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);

static int __net_init sock_inuse_init_net(struct net *net)
{
	net->core.inuse = alloc_percpu(struct prot_inuse);
	return net->core.inuse ? 0 : -ENOMEM;
}

static void __net_exit sock_inuse_exit_net(struct net *net)
{
	free_percpu(net->core.inuse);
}

static struct pernet_operations net_inuse_ops = {
	.init = sock_inuse_init_net,
	.exit = sock_inuse_exit_net,
};

static __init int net_inuse_init(void)
{
	if (register_pernet_subsys(&net_inuse_ops))
		panic("Cannot initialize net inuse counters");

	return 0;
}

core_initcall(net_inuse_init);
#else
static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);

void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
	__this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_add);

int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
	int cpu, idx = prot->inuse_idx;
	int res = 0;

	for_each_possible_cpu(cpu)
		res += per_cpu(prot_inuse, cpu).val[idx];

	return res >= 0 ? res : 0;
}
EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
#endif

static void assign_proto_idx(struct proto *prot)
{
	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
		pr_err("PROTO_INUSE_NR exhausted\n");
		return;
	}

	set_bit(prot->inuse_idx, proto_inuse_idx);
}

static void release_proto_idx(struct proto *prot)
{
	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
		clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
static inline void assign_proto_idx(struct proto *prot)
{
}

static inline void release_proto_idx(struct proto *prot)
{
}
#endif
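/*
 * Illustrative sketch only: protocols feed these per-cpu counters from their
 * hash/unhash handlers, which is where the "sockets" column of
 * /proc/net/protocols comes from.  my_hash()/my_unhash() are hypothetical.
 *
 *	static int my_hash(struct sock *sk)
 *	{
 *		// ... insert sk into the protocol's lookup table, then:
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *		return 0;
 *	}
 *
 *	static void my_unhash(struct sock *sk)
 *	{
 *		// ... remove sk from the lookup table, then:
 *		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *	}
 */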
static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
{
	if (!rsk_prot)
		return;
	kfree(rsk_prot->slab_name);
	rsk_prot->slab_name = NULL;
	kmem_cache_destroy(rsk_prot->slab);
	rsk_prot->slab = NULL;
}

static int req_prot_init(const struct proto *prot)
{
	struct request_sock_ops *rsk_prot = prot->rsk_prot;

	if (!rsk_prot)
		return 0;

	rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
					prot->name);
	if (!rsk_prot->slab_name)
		return -ENOMEM;

	rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
					   rsk_prot->obj_size, 0,
					   prot->slab_flags, NULL);

	if (!rsk_prot->slab) {
		pr_crit("%s: Can't create request sock SLAB cache!\n",
			prot->name);
		return -ENOMEM;
	}
	return 0;
}

int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);

		if (prot->slab == NULL) {
			pr_crit("%s: Can't create sock SLAB cache!\n",
				prot->name);
			goto out;
		}

		if (req_prot_init(prot))
			goto out_free_request_sock_slab;

		if (prot->twsk_prot != NULL) {
			prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);

			if (prot->twsk_prot->twsk_slab_name == NULL)
				goto out_free_request_sock_slab;

			prot->twsk_prot->twsk_slab =
				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
						  prot->twsk_prot->twsk_obj_size,
						  0,
						  prot->slab_flags,
						  NULL);
			if (prot->twsk_prot->twsk_slab == NULL)
				goto out_free_timewait_sock_slab_name;
		}
	}

	mutex_lock(&proto_list_mutex);
	list_add(&prot->node, &proto_list);
	assign_proto_idx(prot);
	mutex_unlock(&proto_list_mutex);
	return 0;

out_free_timewait_sock_slab_name:
	kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
	req_prot_cleanup(prot->rsk_prot);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;
out:
	return -ENOBUFS;
}
EXPORT_SYMBOL(proto_register);

void proto_unregister(struct proto *prot)
{
	mutex_lock(&proto_list_mutex);
	release_proto_idx(prot);
	list_del(&prot->node);
	mutex_unlock(&proto_list_mutex);

	kmem_cache_destroy(prot->slab);
	prot->slab = NULL;

	req_prot_cleanup(prot->rsk_prot);

	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
		kfree(prot->twsk_prot->twsk_slab_name);
		prot->twsk_prot->twsk_slab = NULL;
	}
}
EXPORT_SYMBOL(proto_unregister);
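/*
 * Illustrative sketch only: a protocol module registers its struct proto at
 * init time and unregisters it on exit.  Passing alloc_slab == 1 asks
 * proto_register() to create the per-protocol sock slab from .obj_size.
 * my_proto and struct my_sock are hypothetical.
 *
 *	static struct proto my_proto = {
 *		.name		= "MYPROTO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct my_sock),
 *	};
 *
 *	static int __init my_proto_init(void)
 *	{
 *		return proto_register(&my_proto, 1);
 *	}
 *
 *	static void __exit my_proto_exit(void)
 *	{
 *		proto_unregister(&my_proto);
 *	}
 */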
"no" : "yes", 2932 module_name(proto->owner), 2933 proto_method_implemented(proto->close), 2934 proto_method_implemented(proto->connect), 2935 proto_method_implemented(proto->disconnect), 2936 proto_method_implemented(proto->accept), 2937 proto_method_implemented(proto->ioctl), 2938 proto_method_implemented(proto->init), 2939 proto_method_implemented(proto->destroy), 2940 proto_method_implemented(proto->shutdown), 2941 proto_method_implemented(proto->setsockopt), 2942 proto_method_implemented(proto->getsockopt), 2943 proto_method_implemented(proto->sendmsg), 2944 proto_method_implemented(proto->recvmsg), 2945 proto_method_implemented(proto->sendpage), 2946 proto_method_implemented(proto->bind), 2947 proto_method_implemented(proto->backlog_rcv), 2948 proto_method_implemented(proto->hash), 2949 proto_method_implemented(proto->unhash), 2950 proto_method_implemented(proto->get_port), 2951 proto_method_implemented(proto->enter_memory_pressure)); 2952 } 2953 2954 static int proto_seq_show(struct seq_file *seq, void *v) 2955 { 2956 if (v == &proto_list) 2957 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", 2958 "protocol", 2959 "size", 2960 "sockets", 2961 "memory", 2962 "press", 2963 "maxhdr", 2964 "slab", 2965 "module", 2966 "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); 2967 else 2968 proto_seq_printf(seq, list_entry(v, struct proto, node)); 2969 return 0; 2970 } 2971 2972 static const struct seq_operations proto_seq_ops = { 2973 .start = proto_seq_start, 2974 .next = proto_seq_next, 2975 .stop = proto_seq_stop, 2976 .show = proto_seq_show, 2977 }; 2978 2979 static int proto_seq_open(struct inode *inode, struct file *file) 2980 { 2981 return seq_open_net(inode, file, &proto_seq_ops, 2982 sizeof(struct seq_net_private)); 2983 } 2984 2985 static const struct file_operations proto_seq_fops = { 2986 .owner = THIS_MODULE, 2987 .open = proto_seq_open, 2988 .read = seq_read, 2989 .llseek = seq_lseek, 2990 .release = seq_release_net, 2991 }; 2992 2993 static __net_init int proto_init_net(struct net *net) 2994 { 2995 if (!proc_create("protocols", S_IRUGO, net->proc_net, &proto_seq_fops)) 2996 return -ENOMEM; 2997 2998 return 0; 2999 } 3000 3001 static __net_exit void proto_exit_net(struct net *net) 3002 { 3003 remove_proc_entry("protocols", net->proc_net); 3004 } 3005 3006 3007 static __net_initdata struct pernet_operations proto_net_ops = { 3008 .init = proto_init_net, 3009 .exit = proto_exit_net, 3010 }; 3011 3012 static int __init proto_init(void) 3013 { 3014 return register_pernet_subsys(&proto_net_ops); 3015 } 3016 3017 subsys_initcall(proto_init); 3018 3019 #endif /* PROC_FS */ 3020