// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET3:	Garbage Collector For AF_UNIX sockets
 *
 * Garbage Collector:
 *	Copyright (C) Barak A. Pearlmutter.
 *
 * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem.
 * If it doesn't work blame me, it worked when Barak sent it.
 *
 * Assumptions:
 *
 *  - object w/ a bit
 *  - free list
 *
 * Current optimizations:
 *
 *  - explicit stack instead of recursion
 *  - tail recurse on first born instead of immediate push/pop
 *  - we gather the stuff that should not be killed into tree
 *    and stack is just a path from root to the current pointer.
 *
 *  Future optimizations:
 *
 *  - don't just push entire root set; process in place
 *
 *  Fixes:
 *	Alan Cox	07 Sept 1997	Vmalloc internal stack as needed.
 *					Cope with changing max_files.
 *	Al Viro		11 Oct 1998
 *		Graph may have cycles. That is, we can send the descriptor
 *		of foo to bar and vice versa. Current code chokes on that.
 *		Fix: move SCM_RIGHTS ones into the separate list and then
 *		skb_free() them all instead of doing explicit fput's.
 *		Another problem: since fput() may block somebody may
 *		create a new unix_socket when we are in the middle of sweep
 *		phase. Fix: revert the logic wrt MARKED. Mark everything
 *		upon the beginning and unmark non-junk ones.
 *
 *		[12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS
 *		sent to connect()'ed but still not accept()'ed sockets.
 *		Fixed. Old code had slightly different problem here:
 *		extra fput() in situation when we passed the descriptor via
 *		such socket and closed it (descriptor). That would happen on
 *		each unix_gc() until the accept(). Since the struct file in
 *		question would go to the free list and might be reused...
 *		That might be the reason of random oopses on filp_close()
 *		in unrelated processes.
 *
 *	AV		28 Feb 1999
 *		Kill the explicit allocation of stack. Now we keep the tree
 *		with root in dummy + pointer (gc_current) to one of the nodes.
 *		Stack is represented as path from gc_current to dummy. Unmark
 *		now means "add to tree". Push == "make it a son of gc_current".
 *		Pop == "move gc_current to parent". We keep only pointers to
 *		parents (->gc_tree).
 *	AV		1 Mar 1999
 *		Damn. Added missing check for ->dead in listen queues
 *		scanning.
 *
 *	Miklos Szeredi 25 Jun 2007
 *		Reimplement with a cycle collecting algorithm. This should
 *		solve several problems with the previous code, like being racy
 *		wrt receive and holding up unrelated socket operations.
 */

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/workqueue.h>
#include <net/af_unix.h>
#include <net/scm.h>
#include <net/tcp_states.h>

#include "af_unix.h"

struct unix_vertex {
	struct list_head edges;
	struct list_head entry;
	struct list_head scc_entry;
	unsigned long out_degree;
	unsigned long index;
	unsigned long scc_index;
};

struct unix_edge {
	struct unix_sock *predecessor;
	struct unix_sock *successor;
	struct list_head vertex_entry;
	struct list_head stack_entry;
};

struct unix_sock *unix_get_socket(struct file *filp)
{
	struct inode *inode = file_inode(filp);

	/* Socket ? */
	if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
		struct socket *sock = SOCKET_I(inode);
		const struct proto_ops *ops;
		struct sock *sk = sock->sk;

		ops = READ_ONCE(sock->ops);

		/* PF_UNIX ? */
		if (sk && ops && ops->family == PF_UNIX)
			return unix_sk(sk);
	}

	return NULL;
}
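
/* The in-flight graph: every AF_UNIX socket whose fd sits in a not yet
 * received SCM_RIGHTS message owns a unix_vertex, and each such fd adds a
 * unix_edge from that socket (predecessor) to the socket it was sent to
 * (successor).  A cyclic reference arises, for example, when each end of
 * a socketpair() sends its own fd to the peer and both ends are then
 * close()d: each socket is kept alive only by the skb queued on the
 * other, so only unix_gc() can reclaim them.
 */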
static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
{
	/* If an embryo socket has an fd,
	 * the listener indirectly holds the fd's refcnt.
	 */
	if (edge->successor->listener)
		return unix_sk(edge->successor->listener)->vertex;

	return edge->successor->vertex;
}

enum {
	UNIX_GRAPH_NOT_CYCLIC,
	UNIX_GRAPH_MAYBE_CYCLIC,
	UNIX_GRAPH_CYCLIC,
};

static unsigned char unix_graph_state;

static void unix_update_graph(struct unix_vertex *vertex)
{
	/* If the receiver socket is not inflight, no cyclic
	 * reference could be formed.
	 */
	if (!vertex)
		return;

	WRITE_ONCE(unix_graph_state, UNIX_GRAPH_MAYBE_CYCLIC);
}

static LIST_HEAD(unix_unvisited_vertices);

enum unix_vertex_index {
	UNIX_VERTEX_INDEX_MARK1,
	UNIX_VERTEX_INDEX_MARK2,
	UNIX_VERTEX_INDEX_START,
};

static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
static unsigned long unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START;

static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
	struct unix_vertex *vertex = edge->predecessor->vertex;

	if (!vertex) {
		vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
		vertex->index = unix_vertex_unvisited_index;
		vertex->scc_index = ++unix_vertex_max_scc_index;
		vertex->out_degree = 0;
		INIT_LIST_HEAD(&vertex->edges);
		INIT_LIST_HEAD(&vertex->scc_entry);

		list_move_tail(&vertex->entry, &unix_unvisited_vertices);
		edge->predecessor->vertex = vertex;
	}

	vertex->out_degree++;
	list_add_tail(&edge->vertex_entry, &vertex->edges);

	unix_update_graph(unix_edge_successor(edge));
}

static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
{
	struct unix_vertex *vertex = edge->predecessor->vertex;

	if (!fpl->dead)
		unix_update_graph(unix_edge_successor(edge));

	list_del(&edge->vertex_entry);
	vertex->out_degree--;

	if (!vertex->out_degree) {
		edge->predecessor->vertex = NULL;
		list_move_tail(&vertex->entry, &fpl->vertices);
	}
}

static void unix_free_vertices(struct scm_fp_list *fpl)
{
	struct unix_vertex *vertex, *next_vertex;

	list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
		list_del(&vertex->entry);
		kfree(vertex);
	}
}

static __cacheline_aligned_in_smp DEFINE_SPINLOCK(unix_gc_lock);

void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
{
	int i = 0, j = 0;

	spin_lock(&unix_gc_lock);

	if (!fpl->count_unix)
		goto out;

	do {
		struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
		struct unix_edge *edge;

		if (!inflight)
			continue;

		edge = fpl->edges + i++;
		edge->predecessor = inflight;
		edge->successor = receiver;

		unix_add_edge(fpl, edge);
	} while (i < fpl->count_unix);

	receiver->scm_stat.nr_unix_fds += fpl->count_unix;
out:
	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);

	spin_unlock(&unix_gc_lock);

	fpl->inflight = true;

	unix_free_vertices(fpl);
}
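
/* unix_del_edges() undoes unix_add_edges() once the SCM_RIGHTS skb that
 * carried the fds is finally received or purged: each edge is dropped,
 * and a vertex whose out_degree falls to zero is detached from its socket
 * and handed back to the fpl for freeing.
 */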
void unix_del_edges(struct scm_fp_list *fpl)
{
	struct unix_sock *receiver;
	int i = 0;

	spin_lock(&unix_gc_lock);

	if (!fpl->count_unix)
		goto out;

	do {
		struct unix_edge *edge = fpl->edges + i++;

		unix_del_edge(fpl, edge);
	} while (i < fpl->count_unix);

	if (!fpl->dead) {
		receiver = fpl->edges[0].successor;
		receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
	}
out:
	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);

	spin_unlock(&unix_gc_lock);

	fpl->inflight = false;
}

void unix_update_edges(struct unix_sock *receiver)
{
	/* nr_unix_fds is only updated under unix_state_lock().
	 * If it's 0 here, the embryo socket is not part of the
	 * inflight graph, and GC will not see it, so no lock needed.
	 */
	if (!receiver->scm_stat.nr_unix_fds) {
		receiver->listener = NULL;
	} else {
		spin_lock(&unix_gc_lock);
		unix_update_graph(unix_sk(receiver->listener)->vertex);
		receiver->listener = NULL;
		spin_unlock(&unix_gc_lock);
	}
}

int unix_prepare_fpl(struct scm_fp_list *fpl)
{
	struct unix_vertex *vertex;
	int i;

	if (!fpl->count_unix)
		return 0;

	for (i = 0; i < fpl->count_unix; i++) {
		vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
		if (!vertex)
			goto err;

		list_add(&vertex->entry, &fpl->vertices);
	}

	fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
				    GFP_KERNEL_ACCOUNT);
	if (!fpl->edges)
		goto err;

	unix_schedule_gc(fpl->user);

	return 0;

err:
	unix_free_vertices(fpl);
	return -ENOMEM;
}

void unix_destroy_fpl(struct scm_fp_list *fpl)
{
	if (fpl->inflight)
		unix_del_edges(fpl);

	kvfree(fpl->edges);
	unix_free_vertices(fpl);
}

static bool unix_vertex_dead(struct unix_vertex *vertex)
{
	struct unix_edge *edge;
	struct unix_sock *u;
	long total_ref;

	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		struct unix_vertex *next_vertex = unix_edge_successor(edge);

		/* The vertex's fd can be received by a non-inflight socket. */
		if (!next_vertex)
			return false;

		/* The vertex's fd can be received by an inflight socket in
		 * another SCC.
		 */
		if (next_vertex->scc_index != vertex->scc_index)
			return false;
	}

	/* No receiver exists outside the same SCC. */

	edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
	u = edge->predecessor;
	total_ref = file_count(u->sk.sk_socket->file);

	/* If not close()d, total_ref > out_degree. */
	if (total_ref != vertex->out_degree)
		return false;

	return true;
}
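
/* Once an SCC is known to be dead, unix_collect_skb() splices the receive
 * queue of every socket in it onto the hitlist.  For a TCP_LISTEN socket
 * the queue holds embryo sockets, so their receive queues are drained
 * instead.
 */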
static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
{
	struct unix_vertex *vertex;

	list_for_each_entry_reverse(vertex, scc, scc_entry) {
		struct sk_buff_head *queue;
		struct unix_edge *edge;
		struct unix_sock *u;

		edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
		u = edge->predecessor;
		queue = &u->sk.sk_receive_queue;

		spin_lock(&queue->lock);

		if (u->sk.sk_state == TCP_LISTEN) {
			struct sk_buff *skb;

			skb_queue_walk(queue, skb) {
				struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;

				spin_lock(&embryo_queue->lock);
				skb_queue_splice_init(embryo_queue, hitlist);
				spin_unlock(&embryo_queue->lock);
			}
		} else {
			skb_queue_splice_init(queue, hitlist);
		}

		spin_unlock(&queue->lock);
	}
}

static bool unix_scc_cyclic(struct list_head *scc)
{
	struct unix_vertex *vertex;
	struct unix_edge *edge;

	/* SCC containing multiple vertices ? */
	if (!list_is_singular(scc))
		return true;

	vertex = list_first_entry(scc, typeof(*vertex), scc_entry);

	/* Self-reference or an embryo-listener circle ? */
	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		if (unix_edge_successor(edge) == vertex)
			return true;
	}

	return false;
}

static LIST_HEAD(unix_visited_vertices);
static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;

static unsigned long __unix_walk_scc(struct unix_vertex *vertex,
				     unsigned long *last_index,
				     struct sk_buff_head *hitlist)
{
	unsigned long cyclic_sccs = 0;
	LIST_HEAD(vertex_stack);
	struct unix_edge *edge;
	LIST_HEAD(edge_stack);

next_vertex:
	/* Push vertex to vertex_stack and mark it as on-stack
	 * (index >= UNIX_VERTEX_INDEX_START).
	 * The vertex will be popped when finalising SCC later.
	 */
	list_add(&vertex->scc_entry, &vertex_stack);

	vertex->index = *last_index;
	vertex->scc_index = *last_index;
	(*last_index)++;

	/* Explore neighbour vertices (receivers of the current vertex's fd). */
	list_for_each_entry(edge, &vertex->edges, vertex_entry) {
		struct unix_vertex *next_vertex = unix_edge_successor(edge);

		if (!next_vertex)
			continue;

		if (next_vertex->index == unix_vertex_unvisited_index) {
			/* Iterative depth first search with an explicit stack.
			 *
			 *   1. Push a forward edge to edge_stack and set
			 *      the successor to vertex for the next iteration.
			 */
			list_add(&edge->stack_entry, &edge_stack);

			vertex = next_vertex;
			goto next_vertex;

			/*   2. Pop the edge directed to the current vertex
			 *      and restore the ancestor for backtracking.
			 */
prev_vertex:
			edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
			list_del_init(&edge->stack_entry);

			next_vertex = vertex;
			vertex = edge->predecessor->vertex;

			/* If the successor has a smaller scc_index, two vertices
			 * are in the same SCC, so propagate the smaller scc_index
			 * to skip SCC finalisation.
			 */
			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
		} else if (next_vertex->index != unix_vertex_grouped_index) {
			/* Loop detected by a back/cross edge.
			 *
			 * The successor is on vertex_stack, so two vertices are in
			 * the same SCC.  If the successor has a smaller scc_index,
			 * propagate it to skip SCC finalisation.
			 */
			vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
		} else {
			/* The successor was already grouped as another SCC */
		}
	}

	if (vertex->index == vertex->scc_index) {
		struct unix_vertex *v;
		struct list_head scc;
		bool scc_dead = true;

		/* SCC finalised.
		 *
		 * If the scc_index was not updated, all the vertices above on
		 * vertex_stack are in the same SCC. Group them using scc_entry.
		 */
		__list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);

		list_for_each_entry_reverse(v, &scc, scc_entry) {
			/* Don't restart DFS from this vertex in unix_walk_scc(). */
			list_move_tail(&v->entry, &unix_visited_vertices);

			/* Mark vertex as off-stack. */
			v->index = unix_vertex_grouped_index;

			if (scc_dead)
				scc_dead = unix_vertex_dead(v);
		}

		if (scc_dead) {
			unix_collect_skb(&scc, hitlist);
		} else {
			if (unix_vertex_max_scc_index < vertex->scc_index)
				unix_vertex_max_scc_index = vertex->scc_index;

			if (unix_scc_cyclic(&scc))
				cyclic_sccs++;
		}

		list_del(&scc);
	}

	/* Need backtracking ? */
	if (!list_empty(&edge_stack))
		goto prev_vertex;

	return cyclic_sccs;
}

static unsigned long unix_graph_cyclic_sccs;
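
/* unix_walk_scc() performs the full Tarjan-style walk: every unvisited
 * vertex is grouped into a strongly connected component, dead SCCs have
 * their queued skbs collected into the hitlist, and the count of
 * remaining cyclic SCCs decides whether the graph is recorded as cyclic
 * or not cyclic.
 */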
static void unix_walk_scc(struct sk_buff_head *hitlist)
{
	unsigned long last_index = UNIX_VERTEX_INDEX_START;
	unsigned long cyclic_sccs = 0;

	unix_vertex_max_scc_index = UNIX_VERTEX_INDEX_START;

	/* Visit every vertex exactly once.
	 * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
	 */
	while (!list_empty(&unix_unvisited_vertices)) {
		struct unix_vertex *vertex;

		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
		cyclic_sccs += __unix_walk_scc(vertex, &last_index, hitlist);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
	swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);

	WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs);
	WRITE_ONCE(unix_graph_state,
		   cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC);
}

static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
{
	unsigned long cyclic_sccs = unix_graph_cyclic_sccs;

	while (!list_empty(&unix_unvisited_vertices)) {
		struct unix_vertex *vertex;
		struct list_head scc;
		bool scc_dead = true;

		vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
		list_add(&scc, &vertex->scc_entry);

		list_for_each_entry_reverse(vertex, &scc, scc_entry) {
			list_move_tail(&vertex->entry, &unix_visited_vertices);

			if (scc_dead)
				scc_dead = unix_vertex_dead(vertex);
		}

		if (scc_dead) {
			cyclic_sccs--;
			unix_collect_skb(&scc, hitlist);
		}

		list_del(&scc);
	}

	list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);

	WRITE_ONCE(unix_graph_cyclic_sccs, cyclic_sccs);
	WRITE_ONCE(unix_graph_state,
		   cyclic_sccs ? UNIX_GRAPH_CYCLIC : UNIX_GRAPH_NOT_CYCLIC);
}
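
/* unix_gc() runs from a workqueue, with gc_in_progress preventing it from
 * being queued again while a run is pending.  It bails out early if the
 * graph is known to be acyclic, takes the fast path when the SCCs are
 * already grouped (UNIX_GRAPH_CYCLIC), and otherwise re-runs the full SCC
 * walk.  Skbs on the hitlist are purged outside unix_gc_lock, with their
 * fp lists marked dead so unix_del_edges() skips the receiver accounting.
 */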
static bool gc_in_progress;

static void unix_gc(struct work_struct *work)
{
	struct sk_buff_head hitlist;
	struct sk_buff *skb;

	spin_lock(&unix_gc_lock);

	if (unix_graph_state == UNIX_GRAPH_NOT_CYCLIC) {
		spin_unlock(&unix_gc_lock);
		goto skip_gc;
	}

	__skb_queue_head_init(&hitlist);

	if (unix_graph_state == UNIX_GRAPH_CYCLIC)
		unix_walk_scc_fast(&hitlist);
	else
		unix_walk_scc(&hitlist);

	spin_unlock(&unix_gc_lock);

	skb_queue_walk(&hitlist, skb) {
		if (UNIXCB(skb).fp)
			UNIXCB(skb).fp->dead = true;
	}

	__skb_queue_purge_reason(&hitlist, SKB_DROP_REASON_SOCKET_CLOSE);
skip_gc:
	WRITE_ONCE(gc_in_progress, false);
}

static DECLARE_WORK(unix_gc_work, unix_gc);

#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)

void unix_schedule_gc(struct user_struct *user)
{
	if (READ_ONCE(unix_graph_state) == UNIX_GRAPH_NOT_CYCLIC)
		return;

	/* Penalise users who want to send AF_UNIX sockets
	 * but whose sockets have not been received yet.
	 */
	if (user &&
	    READ_ONCE(user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
		return;

	if (!READ_ONCE(gc_in_progress)) {
		WRITE_ONCE(gc_in_progress, true);
		queue_work(system_dfl_wq, &unix_gc_work);
	}

	if (user && READ_ONCE(unix_graph_cyclic_sccs))
		flush_work(&unix_gc_work);
}