xref: /freebsd/sys/netlink/netlink_message_writer.c (revision c5405d1c850765d04f74067ebb71f57e9a26b8ea)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include <sys/param.h>
29 #include <sys/malloc.h>
30 #include <sys/lock.h>
31 #include <sys/rmlock.h>
32 #include <sys/mbuf.h>
33 #include <sys/ck.h>
34 #include <sys/socket.h>
35 #include <sys/socketvar.h>
36 #include <sys/syslog.h>
37 
38 #include <netlink/netlink.h>
39 #include <netlink/netlink_ctl.h>
40 #include <netlink/netlink_linux.h>
41 #include <netlink/netlink_var.h>
42 
43 #define	DEBUG_MOD_NAME	nl_writer
44 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
45 #include <netlink/netlink_debug.h>
46 _DECLARE_DEBUG(LOG_INFO);
47 
48 /*
49  * The goal of this file is to provide convenient message writing KPI on top of
50  * different storage methods (mbufs, uio, temporary memory chunks).
51  *
52  * The main KPI guarantee is that the (last) message always resides in the contiguous
53  *  memory buffer, so one is able to update the header after writing the entire message.
54  *
 * This guarantee comes with a side effect of potentially reallocating the
 *  underlying buffer, so one needs to refresh any saved pointers into the
 *  message after something is added to it.
58  *
59  * Messaging layer contains hooks performing transparent Linux translation for the messages.
60  *
61  * There are 3 types of supported targets:
62  *  * socket (adds mbufs to the socket buffer, used for message replies)
63  *  * group (sends mbuf/chain to the specified groups, used for the notifications)
64  *  * chain (returns mbuf chain, used in Linux message translation code)
65  *
66  * There are 3 types of storage:
67  * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
68  *    fits in NLMBUFSIZE)
69  * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
70  *    to be larger than one supported by NS_WRITER_TYPE_MBUF)
71  * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
72  *    Linux sockets, calls translation hook prior to sending messages to the socket).
73  *
74  * Internally, KPI switches between different types of storage when memory requirements
75  *  change. It happens transparently to the caller.
76  */
77 
78 /*
 * UMA zone for the mbuf-based Netlink storage
80  */
81 static uma_zone_t	nlmsg_zone;
82 
83 static void
84 nl_free_mbuf_storage(struct mbuf *m)
85 {
86 	uma_zfree(nlmsg_zone, m->m_ext.ext_buf);
87 }
88 
89 static int
90 nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused)
91 {
92 	struct mbuf *m = (struct mbuf *)arg;
93 
94 	if (m != NULL)
95 		m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE);
96 
97 	return (0);
98 }
99 
100 static struct mbuf *
101 nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags)
102 {
103 	struct mbuf *m, *m_storage;
104 
105 	if (size <= MHLEN)
106 		return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags));
107 
108 	if (__predict_false(size > NLMBUFSIZE))
109 		return (NULL);
110 
111 	m = m_gethdr(malloc_flags, MT_DATA);
112 	if (m == NULL)
113 		return (NULL);
114 
115 	m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags);
116 	if (m_storage == NULL) {
117 		m_free_raw(m);
118 		return (NULL);
119 	}
120 
121 	return (m);
122 }
123 
124 static struct mbuf *
125 nl_get_mbuf(int size, int malloc_flags)
126 {
127 	return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR));
128 }
129 
130 /*
131  * Gets a chain of Netlink mbufs.
132  * This is strip-down version of m_getm2()
133  */
134 static struct mbuf *
135 nl_get_mbuf_chain(int len, int malloc_flags)
136 {
137 	struct mbuf *m_chain = NULL, *m_tail = NULL;
138 	int mbuf_flags = M_PKTHDR;
139 
140 	while (len > 0) {
141 		int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len;
142 		struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags);
143 
144 		if (m == NULL) {
145 			m_freem(m_chain);
146 			return (NULL);
147 		}
148 
149 		/* Book keeping. */
150 		len -= M_SIZE(m);
151 		if (m_tail != NULL)
152 			m_tail->m_next = m;
153 		else
154 			m_chain = m;
155 		m_tail = m;
156 		mbuf_flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
157 	}
158 
159 	return (m_chain);
160 }
161 
162 void
163 nl_init_msg_zone(void)
164 {
165 	nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage,
166 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
167 }
168 
169 void
170 nl_destroy_msg_zone(void)
171 {
172 	uma_zdestroy(nlmsg_zone);
173 }
174 
175 
176 typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
177 typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);
178 
179 struct nlwriter_ops {
180 	nlwriter_op_init	*init;
181 	nlwriter_op_write	*write_socket;
182 	nlwriter_op_write	*write_group;
183 	nlwriter_op_write	*write_chain;
184 };
185 
186 /*
187  * NS_WRITER_TYPE_BUF
188  * Writes message to a temporary memory buffer,
189  * flushing to the socket/group when buffer size limit is reached
190  */
191 static bool
192 nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
193 {
194 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
195 	nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
196 	if (__predict_false(nw->_storage == NULL))
197 		return (false);
198 	nw->alloc_len = size;
199 	nw->offset = 0;
200 	nw->hdr = NULL;
201 	nw->data = nw->_storage;
202 	nw->writer_type = NS_WRITER_TYPE_BUF;
203 	nw->malloc_flag = mflag;
204 	nw->num_messages = 0;
205 	nw->enomem = false;
206 	return (true);
207 }
208 
209 static bool
210 nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
211 {
212 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
213 	if (__predict_false(datalen == 0)) {
214 		free(buf, M_NETLINK);
215 		return (true);
216 	}
217 
218 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
219 	if (__predict_false(m == NULL)) {
220 		/* XXX: should we set sorcverr? */
221 		free(buf, M_NETLINK);
222 		return (false);
223 	}
224 	m_append(m, datalen, buf);
225 	free(buf, M_NETLINK);
226 
227 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
228 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
229 }
230 
231 static bool
232 nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
233 {
234 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
235 	    nw->arg.group.proto, nw->arg.group.id);
236 	if (__predict_false(datalen == 0)) {
237 		free(buf, M_NETLINK);
238 		return (true);
239 	}
240 
241 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
242 	if (__predict_false(m == NULL)) {
243 		free(buf, M_NETLINK);
244 		return (false);
245 	}
246 	bool success = m_append(m, datalen, buf) != 0;
247 	free(buf, M_NETLINK);
248 
249 	if (!success)
250 		return (false);
251 
252 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
253 	return (true);
254 }
255 
256 static bool
257 nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
258 {
259 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
260 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
261 
262 	if (__predict_false(datalen == 0)) {
263 		free(buf, M_NETLINK);
264 		return (true);
265 	}
266 
267 	if (*m0 == NULL) {
268 		struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
269 
270 		if (__predict_false(m == NULL)) {
271 			free(buf, M_NETLINK);
272 			return (false);
273 		}
274 		*m0 = m;
275 	}
276 	if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
277 		free(buf, M_NETLINK);
278 		return (false);
279 	}
280 	return (true);
281 }
282 
283 
284 /*
285  * NS_WRITER_TYPE_MBUF
286  * Writes message to the allocated mbuf,
287  * flushing to socket/group when mbuf size limit is reached.
288  * This is the most efficient mechanism as it avoids double-copying.
289  *
290  * Allocates a single mbuf suitable to store up to @size bytes of data.
291  * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr.
292  * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone.
293  * Returns NULL on greater size or the allocation failure.
294  */
295 static bool
296 nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
297 {
298 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
299 	struct mbuf *m = nl_get_mbuf(size, mflag);
300 
301 	if (__predict_false(m == NULL))
302 		return (false);
303 	nw->alloc_len = M_TRAILINGSPACE(m);
304 	nw->offset = 0;
305 	nw->hdr = NULL;
306 	nw->_storage = (void *)m;
307 	nw->data = mtod(m, void *);
308 	nw->writer_type = NS_WRITER_TYPE_MBUF;
309 	nw->malloc_flag = mflag;
310 	nw->num_messages = 0;
311 	nw->enomem = false;
312 	memset(nw->data, 0, size);
313 	NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
314 	    m, size, nw->alloc_len, nw->data);
315 	return (true);
316 }
317 
318 static bool
319 nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
320 {
321 	struct mbuf *m = (struct mbuf *)buf;
322 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
323 
324 	if (__predict_false(datalen == 0)) {
325 		m_freem(m);
326 		return (true);
327 	}
328 
329 	m->m_pkthdr.len = datalen;
330 	m->m_len = datalen;
331 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
332 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
333 }
334 
335 static bool
336 nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
337 {
338 	struct mbuf *m = (struct mbuf *)buf;
339 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
340 	    nw->arg.group.proto, nw->arg.group.id);
341 
342 	if (__predict_false(datalen == 0)) {
343 		m_freem(m);
344 		return (true);
345 	}
346 
347 	m->m_pkthdr.len = datalen;
348 	m->m_len = datalen;
349 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
350 	return (true);
351 }
352 
353 static bool
354 nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
355 {
356 	struct mbuf *m_new = (struct mbuf *)buf;
357 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
358 
359 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
360 
361 	if (__predict_false(datalen == 0)) {
362 		m_freem(m_new);
363 		return (true);
364 	}
365 
366 	m_new->m_pkthdr.len = datalen;
367 	m_new->m_len = datalen;
368 
369 	if (*m0 == NULL) {
370 		*m0 = m_new;
371 	} else {
372 		struct mbuf *m_last;
373 		for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
374 			;
375 		m_last->m_next = m_new;
376 		(*m0)->m_pkthdr.len += datalen;
377 	}
378 
379 	return (true);
380 }
381 
382 /*
383  * NS_WRITER_TYPE_LBUF
384  * Writes message to the allocated memory buffer,
385  * flushing to socket/group when mbuf size limit is reached.
386  * Calls linux handler to rewrite messages before sending to the socket.
387  */
388 static bool
389 nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
390 {
391 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
392 	size = roundup2(size, sizeof(void *));
393 	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
394 	char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
395 	if (__predict_false(buf == NULL))
396 		return (false);
397 
398 	/* Fill buffer header first */
399 	struct linear_buffer *lb = (struct linear_buffer *)buf;
400 	lb->base = &buf[sizeof(struct linear_buffer) + size];
401 	lb->size = size + SCRATCH_BUFFER_SIZE;
402 
403 	nw->alloc_len = size;
404 	nw->offset = 0;
405 	nw->hdr = NULL;
406 	nw->_storage = buf;
407 	nw->data = (char *)(lb + 1);
408 	nw->malloc_flag = mflag;
409 	nw->writer_type = NS_WRITER_TYPE_LBUF;
410 	nw->num_messages = 0;
411 	nw->enomem = false;
412 	return (true);
413 }
414 
415 static bool
416 nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
417 {
418 	struct linear_buffer *lb = (struct linear_buffer *)buf;
419 	char *data = (char *)(lb + 1);
420 	struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);
421 
422 	if (__predict_false(datalen == 0)) {
423 		free(buf, M_NETLINK);
424 		return (true);
425 	}
426 
427 	struct mbuf *m = NULL;
428 	if (linux_netlink_p != NULL)
429 		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
430 	free(buf, M_NETLINK);
431 
432 	if (__predict_false(m == NULL)) {
433 		/* XXX: should we set sorcverr? */
434 		return (false);
435 	}
436 
437 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
438 	return (nl_send_one(m, nlp, cnt, io_flags));
439 }
440 
441 /* Shouldn't be called (maybe except Linux code originating message) */
442 static bool
443 nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
444 {
445 	struct linear_buffer *lb = (struct linear_buffer *)buf;
446 	char *data = (char *)(lb + 1);
447 
448 	if (__predict_false(datalen == 0)) {
449 		free(buf, M_NETLINK);
450 		return (true);
451 	}
452 
453 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
454 	if (__predict_false(m == NULL)) {
455 		free(buf, M_NETLINK);
456 		return (false);
457 	}
458 	m_append(m, datalen, data);
459 	free(buf, M_NETLINK);
460 
461 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
462 	return (true);
463 }
464 
465 static const struct nlwriter_ops nlmsg_writers[] = {
466 	/* NS_WRITER_TYPE_MBUF */
467 	{
468 		.init = nlmsg_get_ns_mbuf,
469 		.write_socket = nlmsg_write_socket_mbuf,
470 		.write_group = nlmsg_write_group_mbuf,
471 		.write_chain = nlmsg_write_chain_mbuf,
472 	},
473 	/* NS_WRITER_TYPE_BUF */
474 	{
475 		.init = nlmsg_get_ns_buf,
476 		.write_socket = nlmsg_write_socket_buf,
477 		.write_group = nlmsg_write_group_buf,
478 		.write_chain = nlmsg_write_chain_buf,
479 	},
480 	/* NS_WRITER_TYPE_LBUF */
481 	{
482 		.init = nlmsg_get_ns_lbuf,
483 		.write_socket = nlmsg_write_socket_lbuf,
484 		.write_group = nlmsg_write_group_lbuf,
485 	},
486 };
487 
488 static void
489 nlmsg_set_callback(struct nl_writer *nw)
490 {
491 	const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];
492 
493 	switch (nw->writer_target) {
494 	case NS_WRITER_TARGET_SOCKET:
495 		nw->cb = pops->write_socket;
496 		break;
497 	case NS_WRITER_TARGET_GROUP:
498 		nw->cb = pops->write_group;
499 		break;
500 	case NS_WRITER_TARGET_CHAIN:
501 		nw->cb = pops->write_chain;
502 		break;
503 	default:
504 		panic("not implemented");
505 	}
506 }
507 
508 static bool
509 nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
510 {
511 	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
512 	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
513 	return (nlmsg_writers[type].init(nw, size, waitok));
514 }
515 
516 static bool
517 nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
518 {
519 	int type;
520 
521 	if (!is_linux) {
522 		if (__predict_true(size <= NLMBUFSIZE))
523 			type = NS_WRITER_TYPE_MBUF;
524 		else
525 			type = NS_WRITER_TYPE_BUF;
526 	} else
527 		type = NS_WRITER_TYPE_LBUF;
528 	return (nlmsg_get_buf_type(nw, size, type, waitok));
529 }
530 
531 bool
532 _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
533 {
534 	if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
535 		return (false);
536 	nw->arg.ptr = (void *)nlp;
537 	nw->writer_target = NS_WRITER_TARGET_SOCKET;
538 	nlmsg_set_callback(nw);
539 	return (true);
540 }
541 
542 bool
543 _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
544 {
545 	if (!nlmsg_get_buf(nw, size, false, false))
546 		return (false);
547 	nw->arg.group.proto = protocol;
548 	nw->arg.group.id = group_id;
549 	nw->writer_target = NS_WRITER_TARGET_GROUP;
550 	nlmsg_set_callback(nw);
551 	return (true);
552 }
553 
554 bool
555 _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
556 {
557 	if (!nlmsg_get_buf(nw, size, false, false))
558 		return (false);
559 	*pm = NULL;
560 	nw->arg.ptr = (void *)pm;
561 	nw->writer_target = NS_WRITER_TARGET_CHAIN;
562 	nlmsg_set_callback(nw);
563 	NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
564 	return (true);
565 }
566 
567 void
568 _nlmsg_ignore_limit(struct nl_writer *nw)
569 {
570 	nw->ignore_limit = true;
571 }
572 
573 bool
574 _nlmsg_flush(struct nl_writer *nw)
575 {
576 
577 	if (__predict_false(nw->hdr != NULL)) {
578 		/* Last message has not been completed, skip it. */
579 		int completed_len = (char *)nw->hdr - nw->data;
580 		/* Send completed messages */
581 		nw->offset -= nw->offset - completed_len;
582 		nw->hdr = NULL;
583 	}
584 
585 	NL_LOG(LOG_DEBUG2, "OUT");
586 	bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
587 	nw->_storage = NULL;
588 
589 	if (!result) {
590 		NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
591 	}
592 
593 	return (result);
594 }
595 
596 /*
597  * Flushes previous data and allocates new underlying storage
598  *  sufficient for holding at least @required_len bytes.
599  * Return true on success.
600  */
601 bool
602 _nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
603 {
604 	struct nl_writer ns_new = {};
605 	int completed_len, new_len;
606 
607 	if (nw->enomem)
608 		return (false);
609 
610 	NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
611 	    nw->offset, nw->alloc_len, required_len);
612 
613 	/* Calculated new buffer size and allocate it s*/
614 	completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
615 	if (completed_len > 0 && required_len < NLMBUFSIZE) {
616 		/* We already ran out of space, use the largest effective size */
617 		new_len = max(nw->alloc_len, NLMBUFSIZE);
618 	} else {
619 		if (nw->alloc_len < NLMBUFSIZE)
620 			new_len = NLMBUFSIZE;
621 		else
622 			new_len = nw->alloc_len * 2;
623 		while (new_len < required_len)
624 			new_len *= 2;
625 	}
626 	bool waitok = (nw->malloc_flag == M_WAITOK);
627 	bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
628 	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
629 		nw->enomem = true;
630 		NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
631 		return (false);
632 	}
633 	if (nw->ignore_limit)
634 		nlmsg_ignore_limit(&ns_new);
635 
636 	/* Update callback data */
637 	ns_new.writer_target = nw->writer_target;
638 	nlmsg_set_callback(&ns_new);
639 	ns_new.arg = nw->arg;
640 
641 	/* Copy last (unfinished) header to the new storage */
642 	int last_len = nw->offset - completed_len;
643 	if (last_len > 0) {
644 		memcpy(ns_new.data, nw->hdr, last_len);
645 		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
646 		ns_new.offset = last_len;
647 	}
648 
649 	NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
650 
651 	/* Flush completed headers & switch to the new nw */
652 	nlmsg_flush(nw);
653 	memcpy(nw, &ns_new, sizeof(struct nl_writer));
654 	NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
655 
656 	return (true);
657 }
658 
659 bool
660 _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
661     uint16_t flags, uint32_t len)
662 {
663 	struct nlmsghdr *hdr;
664 
665 	MPASS(nw->hdr == NULL);
666 
667 	int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
668 	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
669 		if (!nlmsg_refill_buffer(nw, required_len))
670 			return (false);
671 	}
672 
673 	hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
674 
675 	hdr->nlmsg_len = len;
676 	hdr->nlmsg_type = type;
677 	hdr->nlmsg_flags = flags;
678 	hdr->nlmsg_seq = seq;
679 	hdr->nlmsg_pid = portid;
680 
681 	nw->hdr = hdr;
682 	nw->offset += sizeof(struct nlmsghdr);
683 
684 	return (true);
685 }
686 
687 bool
688 _nlmsg_end(struct nl_writer *nw)
689 {
690 	MPASS(nw->hdr != NULL);
691 
692 	if (nw->enomem) {
693 		NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
694 		nlmsg_abort(nw);
695 		return (false);
696 	}
697 
698 	nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
699 	NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
700 	    nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
701 	    nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
702 	nw->hdr = NULL;
703 	nw->num_messages++;
704 	return (true);
705 }
706 
707 void
708 _nlmsg_abort(struct nl_writer *nw)
709 {
710 	if (nw->hdr != NULL) {
711 		nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
712 		nw->hdr = NULL;
713 	}
714 }
715 
716 void
717 nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
718     struct nl_pstate *npt)
719 {
720 	struct nlmsgerr *errmsg;
721 	int payload_len;
722 	uint32_t flags = nlp->nl_flags;
723 	struct nl_writer *nw = npt->nw;
724 	bool cap_ack;
725 
726 	payload_len = sizeof(struct nlmsgerr);
727 
728 	/*
729 	 * The only case when we send the full message in the
730 	 * reply is when there is an error and NETLINK_CAP_ACK
731 	 * is not set.
732 	 */
733 	cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
734 	if (!cap_ack)
735 		payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
736 	payload_len = NETLINK_ALIGN(payload_len);
737 
738 	uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
739 	if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
740 		nl_flags |= NLM_F_ACK_TLVS;
741 
742 	NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
743 	    hdr->nlmsg_type, hdr->nlmsg_seq);
744 
745 	if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
746 		goto enomem;
747 
748 	errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
749 	errmsg->error = error;
750 	/* In case of error copy the whole message, else just the header */
751 	memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
752 
753 	if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
754 		nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
755 	if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
756 		nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
757 	if (npt->cookie != NULL)
758 		nlattr_add_raw(nw, npt->cookie);
759 
760 	if (nlmsg_end(nw))
761 		return;
762 enomem:
763 	NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
764 	    hdr->nlmsg_type, hdr->nlmsg_seq);
765 	nlmsg_abort(nw);
766 }
767 
768 bool
769 _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
770 {
771 	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
772 		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
773 		return (false);
774 	}
775 	/* Save operation result */
776 	int *perror = nlmsg_reserve_object(nw, int);
777 	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
778 	    nw->offset, perror);
779 	*perror = error;
780 	nlmsg_end(nw);
781 	nw->suppress_ack = true;
782 
783 	return (true);
784 }
785 
786 #include <netlink/ktest_netlink_message_writer.h>
787