xref: /freebsd/sys/netlink/netlink_message_writer.c (revision edf8578117e8844e02c0121147f45e4609b30680)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include "opt_netlink.h"
29 
30 #include <sys/cdefs.h>
31 #include <sys/param.h>
32 #include <sys/malloc.h>
33 #include <sys/lock.h>
34 #include <sys/rmlock.h>
35 #include <sys/mbuf.h>
36 #include <sys/ck.h>
37 #include <sys/socket.h>
38 #include <sys/socketvar.h>
39 #include <sys/syslog.h>
40 
41 #include <netlink/netlink.h>
42 #include <netlink/netlink_ctl.h>
43 #include <netlink/netlink_linux.h>
44 #include <netlink/netlink_var.h>
45 
46 #define	DEBUG_MOD_NAME	nl_writer
47 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
48 #include <netlink/netlink_debug.h>
49 _DECLARE_DEBUG(LOG_INFO);
50 
51 /*
52  * The goal of this file is to provide convenient message writing KPI on top of
53  * different storage methods (mbufs, uio, temporary memory chunks).
54  *
55  * The main KPI guarantee is that the (last) message always resides in the contiguous
56  *  memory buffer, so one is able to update the header after writing the entire message.
57  *
 * This guarantee comes with a side effect of potentially reallocating the
 *  underlying buffer, so one needs to re-fetch any pointers into the message
 *  after something is added to it.
61  *
62  * Messaging layer contains hooks performing transparent Linux translation for the messages.
63  *
64  * There are 3 types of supported targets:
65  *  * socket (adds mbufs to the socket buffer, used for message replies)
66  *  * group (sends mbuf/chain to the specified groups, used for the notifications)
67  *  * chain (returns mbuf chain, used in Linux message translation code)
68  *
69  * There are 3 types of storage:
70  * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
71  *    fits in NLMBUFSIZE)
72  * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
73  *    to be larger than one supported by NS_WRITER_TYPE_MBUF)
74  * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
75  *    Linux sockets, calls translation hook prior to sending messages to the socket).
76  *
77  * Internally, KPI switches between different types of storage when memory requirements
78  *  change. It happens transparently to the caller.
79  */
80 
81 /*
82  * Uma zone for the mbuf-based Netlink storage
83  */
84 static uma_zone_t	nlmsg_zone;
85 
86 static void
87 nl_free_mbuf_storage(struct mbuf *m)
88 {
89 	uma_zfree(nlmsg_zone, m->m_ext.ext_buf);
90 }
91 
92 static int
93 nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused)
94 {
95 	struct mbuf *m = (struct mbuf *)arg;
96 
97 	if (m != NULL)
98 		m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE);
99 
100 	return (0);
101 }
102 
103 static struct mbuf *
104 nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags)
105 {
106 	struct mbuf *m, *m_storage;
107 
108 	if (size <= MHLEN)
109 		return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags));
110 
111 	if (__predict_false(size > NLMBUFSIZE))
112 		return (NULL);
113 
114 	m = m_gethdr(malloc_flags, MT_DATA);
115 	if (m == NULL)
116 		return (NULL);
117 
118 	m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags);
119 	if (m_storage == NULL) {
120 		m_free_raw(m);
121 		return (NULL);
122 	}
123 
124 	return (m);
125 }
126 
127 static struct mbuf *
128 nl_get_mbuf(int size, int malloc_flags)
129 {
130 	return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR));
131 }
132 
133 /*
134  * Gets a chain of Netlink mbufs.
135  * This is strip-down version of m_getm2()
136  */
137 static struct mbuf *
138 nl_get_mbuf_chain(int len, int malloc_flags)
139 {
140 	struct mbuf *m_chain = NULL, *m_tail = NULL;
141 	int mbuf_flags = M_PKTHDR;
142 
143 	while (len > 0) {
144 		int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len;
145 		struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags);
146 
147 		if (m == NULL) {
148 			m_freem(m_chain);
149 			return (NULL);
150 		}
151 
152 		/* Book keeping. */
153 		len -= M_SIZE(m);
154 		if (m_tail != NULL)
155 			m_tail->m_next = m;
156 		else
157 			m_chain = m;
158 		m_tail = m;
159 		mbuf_flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
160 	}
161 
162 	return (m_chain);
163 }
164 
165 void
166 nl_init_msg_zone(void)
167 {
168 	nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage,
169 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
170 }
171 
172 void
173 nl_destroy_msg_zone(void)
174 {
175 	uma_zdestroy(nlmsg_zone);
176 }
177 
178 
/*
 * Storage initializer: prepares @nw with backing storage for @size bytes.
 * @waitok selects between M_WAITOK and M_NOWAIT allocations.
 * Returns false if the storage could not be set up.
 */
typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);

/*
 * Flush handler: invoked by _nlmsg_flush() with the writer's storage @buf,
 * the number of valid bytes @buflen and the number of messages @cnt.
 * The writer drops its reference to @buf right after the call.
 */
typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen,
    int cnt);

/* Operations of one storage type: an initializer and a handler per target. */
struct nlwriter_ops {
	nlwriter_op_init	*init;		/* storage setup */
	nlwriter_op_write	*write_socket;	/* NS_WRITER_TARGET_SOCKET */
	nlwriter_op_write	*write_group;	/* NS_WRITER_TARGET_GROUP */
	nlwriter_op_write	*write_chain;	/* NS_WRITER_TARGET_CHAIN */
};
188 
189 /*
190  * NS_WRITER_TYPE_BUF
191  * Writes message to a temporary memory buffer,
192  * flushing to the socket/group when buffer size limit is reached
193  */
194 static bool
195 nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
196 {
197 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
198 	nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
199 	if (__predict_false(nw->_storage == NULL))
200 		return (false);
201 	nw->alloc_len = size;
202 	nw->offset = 0;
203 	nw->hdr = NULL;
204 	nw->data = nw->_storage;
205 	nw->writer_type = NS_WRITER_TYPE_BUF;
206 	nw->malloc_flag = mflag;
207 	nw->num_messages = 0;
208 	nw->enomem = false;
209 	return (true);
210 }
211 
212 static bool
213 nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
214 {
215 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
216 	if (__predict_false(datalen == 0)) {
217 		free(buf, M_NETLINK);
218 		return (true);
219 	}
220 
221 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
222 	if (__predict_false(m == NULL)) {
223 		/* XXX: should we set sorcverr? */
224 		free(buf, M_NETLINK);
225 		return (false);
226 	}
227 	m_append(m, datalen, buf);
228 	free(buf, M_NETLINK);
229 
230 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
231 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
232 }
233 
234 static bool
235 nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
236 {
237 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
238 	    nw->arg.group.proto, nw->arg.group.id);
239 	if (__predict_false(datalen == 0)) {
240 		free(buf, M_NETLINK);
241 		return (true);
242 	}
243 
244 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
245 	if (__predict_false(m == NULL)) {
246 		free(buf, M_NETLINK);
247 		return (false);
248 	}
249 	bool success = m_append(m, datalen, buf) != 0;
250 	free(buf, M_NETLINK);
251 
252 	if (!success)
253 		return (false);
254 
255 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
256 	return (true);
257 }
258 
259 static bool
260 nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
261 {
262 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
263 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
264 
265 	if (__predict_false(datalen == 0)) {
266 		free(buf, M_NETLINK);
267 		return (true);
268 	}
269 
270 	if (*m0 == NULL) {
271 		struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
272 
273 		if (__predict_false(m == NULL)) {
274 			free(buf, M_NETLINK);
275 			return (false);
276 		}
277 		*m0 = m;
278 	}
279 	if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
280 		free(buf, M_NETLINK);
281 		return (false);
282 	}
283 	return (true);
284 }
285 
286 
287 /*
288  * NS_WRITER_TYPE_MBUF
289  * Writes message to the allocated mbuf,
290  * flushing to socket/group when mbuf size limit is reached.
291  * This is the most efficient mechanism as it avoids double-copying.
292  *
293  * Allocates a single mbuf suitable to store up to @size bytes of data.
294  * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr.
295  * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone.
296  * Returns NULL on greater size or the allocation failure.
297  */
298 static bool
299 nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
300 {
301 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
302 	struct mbuf *m = nl_get_mbuf(size, mflag);
303 
304 	if (__predict_false(m == NULL))
305 		return (false);
306 	nw->alloc_len = M_TRAILINGSPACE(m);
307 	nw->offset = 0;
308 	nw->hdr = NULL;
309 	nw->_storage = (void *)m;
310 	nw->data = mtod(m, void *);
311 	nw->writer_type = NS_WRITER_TYPE_MBUF;
312 	nw->malloc_flag = mflag;
313 	nw->num_messages = 0;
314 	nw->enomem = false;
315 	memset(nw->data, 0, size);
316 	NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
317 	    m, size, nw->alloc_len, nw->data);
318 	return (true);
319 }
320 
321 static bool
322 nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
323 {
324 	struct mbuf *m = (struct mbuf *)buf;
325 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
326 
327 	if (__predict_false(datalen == 0)) {
328 		m_freem(m);
329 		return (true);
330 	}
331 
332 	m->m_pkthdr.len = datalen;
333 	m->m_len = datalen;
334 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
335 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
336 }
337 
338 static bool
339 nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
340 {
341 	struct mbuf *m = (struct mbuf *)buf;
342 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
343 	    nw->arg.group.proto, nw->arg.group.id);
344 
345 	if (__predict_false(datalen == 0)) {
346 		m_freem(m);
347 		return (true);
348 	}
349 
350 	m->m_pkthdr.len = datalen;
351 	m->m_len = datalen;
352 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
353 	return (true);
354 }
355 
356 static bool
357 nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
358 {
359 	struct mbuf *m_new = (struct mbuf *)buf;
360 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
361 
362 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
363 
364 	if (__predict_false(datalen == 0)) {
365 		m_freem(m_new);
366 		return (true);
367 	}
368 
369 	m_new->m_pkthdr.len = datalen;
370 	m_new->m_len = datalen;
371 
372 	if (*m0 == NULL) {
373 		*m0 = m_new;
374 	} else {
375 		struct mbuf *m_last;
376 		for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
377 			;
378 		m_last->m_next = m_new;
379 		(*m0)->m_pkthdr.len += datalen;
380 	}
381 
382 	return (true);
383 }
384 
385 /*
386  * NS_WRITER_TYPE_LBUF
387  * Writes message to the allocated memory buffer,
388  * flushing to socket/group when mbuf size limit is reached.
389  * Calls linux handler to rewrite messages before sending to the socket.
390  */
391 static bool
392 nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
393 {
394 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
395 	size = roundup2(size, sizeof(void *));
396 	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
397 	char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
398 	if (__predict_false(buf == NULL))
399 		return (false);
400 
401 	/* Fill buffer header first */
402 	struct linear_buffer *lb = (struct linear_buffer *)buf;
403 	lb->base = &buf[sizeof(struct linear_buffer) + size];
404 	lb->size = size + SCRATCH_BUFFER_SIZE;
405 
406 	nw->alloc_len = size;
407 	nw->offset = 0;
408 	nw->hdr = NULL;
409 	nw->_storage = buf;
410 	nw->data = (char *)(lb + 1);
411 	nw->malloc_flag = mflag;
412 	nw->writer_type = NS_WRITER_TYPE_LBUF;
413 	nw->num_messages = 0;
414 	nw->enomem = false;
415 	return (true);
416 }
417 
418 static bool
419 nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
420 {
421 	struct linear_buffer *lb = (struct linear_buffer *)buf;
422 	char *data = (char *)(lb + 1);
423 	struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);
424 
425 	if (__predict_false(datalen == 0)) {
426 		free(buf, M_NETLINK);
427 		return (true);
428 	}
429 
430 	struct mbuf *m = NULL;
431 	if (linux_netlink_p != NULL)
432 		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
433 	free(buf, M_NETLINK);
434 
435 	if (__predict_false(m == NULL)) {
436 		/* XXX: should we set sorcverr? */
437 		return (false);
438 	}
439 
440 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
441 	return (nl_send_one(m, nlp, cnt, io_flags));
442 }
443 
444 /* Shouldn't be called (maybe except Linux code originating message) */
445 static bool
446 nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
447 {
448 	struct linear_buffer *lb = (struct linear_buffer *)buf;
449 	char *data = (char *)(lb + 1);
450 
451 	if (__predict_false(datalen == 0)) {
452 		free(buf, M_NETLINK);
453 		return (true);
454 	}
455 
456 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
457 	if (__predict_false(m == NULL)) {
458 		free(buf, M_NETLINK);
459 		return (false);
460 	}
461 	m_append(m, datalen, data);
462 	free(buf, M_NETLINK);
463 
464 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
465 	return (true);
466 }
467 
468 static const struct nlwriter_ops nlmsg_writers[] = {
469 	/* NS_WRITER_TYPE_MBUF */
470 	{
471 		.init = nlmsg_get_ns_mbuf,
472 		.write_socket = nlmsg_write_socket_mbuf,
473 		.write_group = nlmsg_write_group_mbuf,
474 		.write_chain = nlmsg_write_chain_mbuf,
475 	},
476 	/* NS_WRITER_TYPE_BUF */
477 	{
478 		.init = nlmsg_get_ns_buf,
479 		.write_socket = nlmsg_write_socket_buf,
480 		.write_group = nlmsg_write_group_buf,
481 		.write_chain = nlmsg_write_chain_buf,
482 	},
483 	/* NS_WRITER_TYPE_LBUF */
484 	{
485 		.init = nlmsg_get_ns_lbuf,
486 		.write_socket = nlmsg_write_socket_lbuf,
487 		.write_group = nlmsg_write_group_lbuf,
488 	},
489 };
490 
491 static void
492 nlmsg_set_callback(struct nl_writer *nw)
493 {
494 	const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];
495 
496 	switch (nw->writer_target) {
497 	case NS_WRITER_TARGET_SOCKET:
498 		nw->cb = pops->write_socket;
499 		break;
500 	case NS_WRITER_TARGET_GROUP:
501 		nw->cb = pops->write_group;
502 		break;
503 	case NS_WRITER_TARGET_CHAIN:
504 		nw->cb = pops->write_chain;
505 		break;
506 	default:
507 		panic("not implemented");
508 	}
509 }
510 
511 static bool
512 nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
513 {
514 	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
515 	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
516 	return (nlmsg_writers[type].init(nw, size, waitok));
517 }
518 
519 static bool
520 nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
521 {
522 	int type;
523 
524 	if (!is_linux) {
525 		if (__predict_true(size <= NLMBUFSIZE))
526 			type = NS_WRITER_TYPE_MBUF;
527 		else
528 			type = NS_WRITER_TYPE_BUF;
529 	} else
530 		type = NS_WRITER_TYPE_LBUF;
531 	return (nlmsg_get_buf_type(nw, size, type, waitok));
532 }
533 
534 bool
535 _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
536 {
537 	if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
538 		return (false);
539 	nw->arg.ptr = (void *)nlp;
540 	nw->writer_target = NS_WRITER_TARGET_SOCKET;
541 	nlmsg_set_callback(nw);
542 	return (true);
543 }
544 
545 bool
546 _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
547 {
548 	if (!nlmsg_get_buf(nw, size, false, false))
549 		return (false);
550 	nw->arg.group.proto = protocol;
551 	nw->arg.group.id = group_id;
552 	nw->writer_target = NS_WRITER_TARGET_GROUP;
553 	nlmsg_set_callback(nw);
554 	return (true);
555 }
556 
557 bool
558 _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
559 {
560 	if (!nlmsg_get_buf(nw, size, false, false))
561 		return (false);
562 	*pm = NULL;
563 	nw->arg.ptr = (void *)pm;
564 	nw->writer_target = NS_WRITER_TARGET_CHAIN;
565 	nlmsg_set_callback(nw);
566 	NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
567 	return (true);
568 }
569 
570 void
571 _nlmsg_ignore_limit(struct nl_writer *nw)
572 {
573 	nw->ignore_limit = true;
574 }
575 
576 bool
577 _nlmsg_flush(struct nl_writer *nw)
578 {
579 
580 	if (__predict_false(nw->hdr != NULL)) {
581 		/* Last message has not been completed, skip it. */
582 		int completed_len = (char *)nw->hdr - nw->data;
583 		/* Send completed messages */
584 		nw->offset -= nw->offset - completed_len;
585 		nw->hdr = NULL;
586 	}
587 
588 	NL_LOG(LOG_DEBUG2, "OUT");
589 	bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
590 	nw->_storage = NULL;
591 
592 	if (!result) {
593 		NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
594 	}
595 
596 	return (result);
597 }
598 
599 /*
600  * Flushes previous data and allocates new underlying storage
601  *  sufficient for holding at least @required_len bytes.
602  * Return true on success.
603  */
604 bool
605 _nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
606 {
607 	struct nl_writer ns_new = {};
608 	int completed_len, new_len;
609 
610 	if (nw->enomem)
611 		return (false);
612 
613 	NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
614 	    nw->offset, nw->alloc_len, required_len);
615 
616 	/* Calculated new buffer size and allocate it s*/
617 	completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
618 	if (completed_len > 0 && required_len < NLMBUFSIZE) {
619 		/* We already ran out of space, use the largest effective size */
620 		new_len = max(nw->alloc_len, NLMBUFSIZE);
621 	} else {
622 		if (nw->alloc_len < NLMBUFSIZE)
623 			new_len = NLMBUFSIZE;
624 		else
625 			new_len = nw->alloc_len * 2;
626 		while (new_len < required_len)
627 			new_len *= 2;
628 	}
629 	bool waitok = (nw->malloc_flag == M_WAITOK);
630 	bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
631 	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
632 		nw->enomem = true;
633 		NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
634 		return (false);
635 	}
636 	if (nw->ignore_limit)
637 		nlmsg_ignore_limit(&ns_new);
638 
639 	/* Update callback data */
640 	ns_new.writer_target = nw->writer_target;
641 	nlmsg_set_callback(&ns_new);
642 	ns_new.arg = nw->arg;
643 
644 	/* Copy last (unfinished) header to the new storage */
645 	int last_len = nw->offset - completed_len;
646 	if (last_len > 0) {
647 		memcpy(ns_new.data, nw->hdr, last_len);
648 		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
649 		ns_new.offset = last_len;
650 	}
651 
652 	NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
653 
654 	/* Flush completed headers & switch to the new nw */
655 	nlmsg_flush(nw);
656 	memcpy(nw, &ns_new, sizeof(struct nl_writer));
657 	NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
658 
659 	return (true);
660 }
661 
662 bool
663 _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
664     uint16_t flags, uint32_t len)
665 {
666 	struct nlmsghdr *hdr;
667 
668 	MPASS(nw->hdr == NULL);
669 
670 	int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
671 	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
672 		if (!nlmsg_refill_buffer(nw, required_len))
673 			return (false);
674 	}
675 
676 	hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
677 
678 	hdr->nlmsg_len = len;
679 	hdr->nlmsg_type = type;
680 	hdr->nlmsg_flags = flags;
681 	hdr->nlmsg_seq = seq;
682 	hdr->nlmsg_pid = portid;
683 
684 	nw->hdr = hdr;
685 	nw->offset += sizeof(struct nlmsghdr);
686 
687 	return (true);
688 }
689 
690 bool
691 _nlmsg_end(struct nl_writer *nw)
692 {
693 	MPASS(nw->hdr != NULL);
694 
695 	if (nw->enomem) {
696 		NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
697 		nlmsg_abort(nw);
698 		return (false);
699 	}
700 
701 	nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
702 	NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
703 	    nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
704 	    nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
705 	nw->hdr = NULL;
706 	nw->num_messages++;
707 	return (true);
708 }
709 
710 void
711 _nlmsg_abort(struct nl_writer *nw)
712 {
713 	if (nw->hdr != NULL) {
714 		nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
715 		nw->hdr = NULL;
716 	}
717 }
718 
719 void
720 nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
721     struct nl_pstate *npt)
722 {
723 	struct nlmsgerr *errmsg;
724 	int payload_len;
725 	uint32_t flags = nlp->nl_flags;
726 	struct nl_writer *nw = npt->nw;
727 	bool cap_ack;
728 
729 	payload_len = sizeof(struct nlmsgerr);
730 
731 	/*
732 	 * The only case when we send the full message in the
733 	 * reply is when there is an error and NETLINK_CAP_ACK
734 	 * is not set.
735 	 */
736 	cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
737 	if (!cap_ack)
738 		payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
739 	payload_len = NETLINK_ALIGN(payload_len);
740 
741 	uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
742 	if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
743 		nl_flags |= NLM_F_ACK_TLVS;
744 
745 	NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
746 	    hdr->nlmsg_type, hdr->nlmsg_seq);
747 
748 	if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
749 		goto enomem;
750 
751 	errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
752 	errmsg->error = error;
753 	/* In case of error copy the whole message, else just the header */
754 	memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
755 
756 	if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
757 		nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
758 	if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
759 		nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
760 	if (npt->cookie != NULL)
761 		nlattr_add_raw(nw, npt->cookie);
762 
763 	if (nlmsg_end(nw))
764 		return;
765 enomem:
766 	NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
767 	    hdr->nlmsg_type, hdr->nlmsg_seq);
768 	nlmsg_abort(nw);
769 }
770 
771 bool
772 _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
773 {
774 	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
775 		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
776 		return (false);
777 	}
778 	/* Save operation result */
779 	int *perror = nlmsg_reserve_object(nw, int);
780 	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
781 	    nw->offset, perror);
782 	*perror = error;
783 	nlmsg_end(nw);
784 	nw->suppress_ack = true;
785 
786 	return (true);
787 }
788 
789 #include <netlink/ktest_netlink_message_writer.h>
790