xref: /illumos-gate/usr/src/man/man3c/regcomp.3c (revision 2833423dc59f4c35fe4713dbb942950c82df0437)
1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2.\" Copyright (c) 1992, 1993, 1994
3.\"	The Regents of the University of California.  All rights reserved.
4.\"
5.\" This code is derived from software contributed to Berkeley by
6.\" Henry Spencer.
7.\"
8.\" Redistribution and use in source and binary forms, with or without
9.\" modification, are permitted provided that the following conditions
10.\" are met:
11.\" 1. Redistributions of source code must retain the above copyright
12.\"    notice, this list of conditions and the following disclaimer.
13.\" 2. Redistributions in binary form must reproduce the above copyright
14.\"    notice, this list of conditions and the following disclaimer in the
15.\"    documentation and/or other materials provided with the distribution.
16.\" 3. Neither the name of the University nor the names of its contributors
17.\"    may be used to endorse or promote products derived from this software
18.\"    without specific prior written permission.
19.\"
20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30.\" SUCH DAMAGE.
31.\"
32.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for
33.\" permission to reproduce portions of its copyrighted documentation.
34.\" Original documentation from The Open Group can be obtained online at
35.\" http://www.opengroup.org/bookstore/.
36.\"
37.\" The Institute of Electrical and Electronics Engineers and The Open
38.\" Group, have given us permission to reprint portions of their
39.\" documentation.
40.\"
41.\" In the following statement, the phrase ``this text'' refers to portions
42.\" of the system documentation.
43.\"
44.\" Portions of this text are reprinted and reproduced in electronic form
45.\" in the SunOS Reference Manual, from IEEE Std 1003.1, 2004 Edition,
46.\" Standard for Information Technology -- Portable Operating System
47.\" Interface (POSIX), The Open Group Base Specifications Issue 6,
48.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics
49.\" Engineers, Inc and The Open Group.  In the event of any discrepancy
50.\" between these versions and the original IEEE and The Open Group
51.\" Standard, the original IEEE and The Open Group Standard is the referee
52.\" document.  The original Standard can be obtained online at
53.\" http://www.opengroup.org/unix/online.html.
54.\"
55.\" This notice shall appear on any product containing this material.
56.\"
57.\" The contents of this file are subject to the terms of the
58.\" Common Development and Distribution License (the "License").
59.\" You may not use this file except in compliance with the License.
60.\"
61.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
62.\" or http://www.opensolaris.org/os/licensing.
63.\" See the License for the specific language governing permissions
64.\" and limitations under the License.
65.\"
66.\" When distributing Covered Code, include this CDDL HEADER in each
67.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
68.\" If applicable, add the following below this CDDL HEADER, with the
69.\" fields enclosed by brackets "[]" replaced with your own identifying
70.\" information: Portions Copyright [yyyy] [name of copyright owner]
71.\"
72.\"
73.\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved.
74.\" Portions Copyright (c) 2003, Sun Microsystems, Inc.  All Rights Reserved.
75.\" Copyright 2017 Nexenta Systems, Inc.
76.\"
77.Dd December 26, 2023
78.Dt REGCOMP 3C
79.Os
80.Sh NAME
81.Nm regcomp ,
82.Nm regexec ,
83.Nm regerror ,
84.Nm regfree
85.Nd regular-expression library
86.Sh LIBRARY
87.Lb libc
88.Sh SYNOPSIS
89.In regex.h
90.Ft int
91.Fo regcomp
92.Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags"
93.Fc
94.Ft int
95.Fo regexec
96.Fa "const regex_t *restrict preg" "const char *restrict string"
97.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags"
98.Fc
99.Ft size_t
100.Fo regerror
101.Fa "int errcode" "const regex_t *restrict preg"
102.Fa "char *restrict errbuf" "size_t errbuf_size"
103.Fc
104.Ft void
105.Fn regfree "regex_t *preg"
106.Sh DESCRIPTION
107These routines implement
108.St -p1003.2
109regular expressions; see
110.Xr regex 7 .
111The
112.Fn regcomp
113function compiles an RE written as a string into an internal form,
114.Fn regexec
115matches that internal form against a string and reports results,
116.Fn regerror
117transforms error codes from either into human-readable messages,
118and
119.Fn regfree
120frees any dynamically-allocated storage used by the internal form
121of an RE.
122.Pp
123The translation of an RE into the internal form contained in a
124.Ft regex_t
125is inherently locale-specific; changes to the locale in effect between
126.Fn regcomp
127and subsequent calls to
128.Fn regexec
129may result in unexpected or undefined behavior.
130.Pp
131The header
132.In regex.h
133declares two structure types,
134.Ft regex_t
135and
136.Ft regmatch_t ,
137the former for compiled internal forms and the latter for match reporting.
138It also declares the four functions, a type
139.Ft regoff_t ,
140and a number of constants with names starting with
141.Qq Dv REG_ .
142.Ss Fn regcomp
143The
144.Fn regcomp
145function compiles the regular expression contained in the
146.Fa pattern
147string, subject to the flags in
148.Fa cflags ,
149and places the results in the
150.Ft regex_t
151structure pointed to by
152.Fa preg .
153The
154.Fa cflags
155argument is the bitwise OR of zero or more of the following flags:
156.Bl -tag -width REG_EXTENDED
157.It Dv REG_EXTENDED
158Compile extended regular expressions
159.Pq EREs ,
160rather than the basic regular expressions
161.Pq BREs
162that are the default.
163.It Dv REG_BASIC
164This is a synonym for 0, provided as a counterpart to
165.Dv REG_EXTENDED
166to improve readability.
167.It Dv REG_NOSPEC
168Compile with recognition of all special characters turned off.
169All characters are thus considered ordinary, so the RE is a literal string.
170This is an extension, compatible with but not specified by
171.St -p1003.2 ,
172and should be used with caution in software intended to be portable to other
173systems.
174.Dv REG_EXTENDED
175and
176.Dv REG_NOSPEC
177may not be used in the same call to
178.Fn regcomp .
179.It Dv REG_ICASE
180Compile for matching that ignores upper/lower case distinctions.
181See
182.Xr regex 7 .
183.It Dv REG_NOSUB
184Compile for matching that need only report success or failure,
185not what was matched.
186.It Dv REG_NEWLINE
187Compile for newline-sensitive matching.
188By default, newline is a completely ordinary character with no special
189meaning in either REs or strings.
190With this flag,
191.Qq [^
192bracket expressions and
193.Qq \&.
194never match newline,
195a
196.Qq \&^
197anchor matches the null string after any newline in the string in addition to
198its normal function, and the
199.Qq \&$
200anchor matches the null string before any newline in the string in addition to
201its normal function.
202.It Dv REG_PEND
203The regular expression ends, not at the first NUL, but just before the character
204pointed to by the
205.Va re_endp
206member of the structure pointed to by
207.Fa preg .
208The
209.Va re_endp
210member is of type
211.Vt "const char *" .
212This flag permits inclusion of NULs in the RE; they are considered ordinary
213characters.
214This is an extension, compatible with but not specified by
215.St -p1003.2 ,
216and should be used with caution in software intended to be portable to other
217systems.
218.El
219.Pp
220When successful,
221.Fn regcomp
222returns 0 and fills in the structure pointed to by
223.Fa preg .
224One member of that structure
225.Po other than
226.Va re_endp
227.Pc
228is publicized:
229.Va re_nsub ,
230of type
231.Ft size_t ,
232contains the number of parenthesized subexpressions within the RE
233.Po except that the value of this member is undefined if the
234.Dv REG_NOSUB
235flag was used
236.Pc .
237.Ss Fn regexec
238The
239.Fn regexec
240function matches the compiled RE pointed to by
241.Fa preg
242against the
243.Fa string ,
244subject to the flags in
245.Fa eflags ,
246and reports results using
247.Fa nmatch ,
248.Fa pmatch ,
249and the returned value.
250The RE must have been compiled by a previous invocation of
251.Fn regcomp .
252The compiled form is not altered during execution of
253.Fn regexec ,
254so a single compiled RE can be used simultaneously by multiple threads.
255The locale in effect at the time of
256.Fn regexec
257must be the same as the one in effect when the RE was compiled by
258.Fn regcomp .
259.Pp
260By default, the NUL-terminated string pointed to by
261.Fa string
262is considered to be the text of an entire line, minus any terminating
263newline.
264The
265.Fa eflags
266argument is the bitwise OR of zero or more of the following flags:
267.Bl -tag -width REG_STARTEND
268.It Dv REG_NOTBOL
269The first character of the string is treated as the continuation
270of a line.
271This means that the anchors
272.Qq \&^ ,
273.Qq [[:<:]] ,
274and
275.Qq \e<
276do not match before it; but see
277.Dv REG_STARTEND
278below.
279This does not affect the behavior of newlines under
280.Dv REG_NEWLINE .
281.It Dv REG_NOTEOL
282The NUL terminating the string does not end a line, so the
283.Qq \&$
284anchor does not match before it.
285This does not affect the behavior of newlines under
286.Dv REG_NEWLINE .
287.It Dv REG_STARTEND
288The string is considered to start at
289.Fa string No +
290.Fa pmatch Ns [0]. Ns Fa rm_so
291and to end before the byte located at
292.Fa string No +
293.Fa pmatch Ns [0]. Ns Fa rm_eo ,
294regardless of the value of
295.Fa nmatch .
296See below for the definition of
297.Fa pmatch
298and
299.Fa nmatch .
300This is an extension, compatible with but not specified by
301.St -p1003.2 ,
302and should be used with caution in software intended to be portable to other
303systems.
304.Pp
305Without
306.Dv REG_NOTBOL ,
307the position
308.Fa rm_so
309is considered the beginning of a line, such that
310.Qq \&^
311matches before it, and the beginning of a word if there is a word character at
312this position, such that
313.Qq [[:<:]]
314and
315.Qq \e<
316match before it.
317.Pp
318With
319.Dv REG_NOTBOL ,
320the character at position
321.Fa rm_so
322is treated as the continuation of a line, and if
323.Fa rm_so
324is greater than 0, the preceding character is taken into consideration.
325If the preceding character is a newline and the regular expression was compiled
326with
327.Dv REG_NEWLINE ,
328.Qq ^
329matches before the string; if the preceding character is not a word character
330but the string starts with a word character,
331.Qq [[:<:]]
332and
333.Qq \e<
334match before the string.
335.El
336.Pp
337See
338.Xr regex 7
339for a discussion of what is matched in situations where an RE or a portion
340thereof could match any of several substrings of
341.Fa string .
342.Pp
343If
344.Dv REG_NOSUB
345was specified in the compilation of the RE, or if
346.Fa nmatch
347is 0,
348.Fn regexec
349ignores the
350.Fa pmatch
351argument
352.Po but see below for the case where
353.Dv REG_STARTEND
354is specified
355.Pc .
356Otherwise,
357.Fa pmatch
358points to an array of
359.Fa nmatch
360structures of type
361.Ft regmatch_t .
362Such a structure has at least the members
363.Va rm_so
364and
365.Va rm_eo ,
366both of type
367.Ft regoff_t
368.Po a signed arithmetic type at least as large as an
369.Ft off_t
370and a
371.Ft ssize_t
372.Pc ,
373containing respectively the offset of the first character of a substring
374and the offset of the first character after the end of the substring.
375Offsets are measured from the beginning of the
376.Fa string
377argument given to
378.Fn regexec .
379An empty substring is denoted by equal offsets, both indicating the character
380following the empty substring.
381.Pp
382The 0th member of the
383.Fa pmatch
384array is filled in to indicate what substring of
385.Fa string
386was matched by the entire RE.
387Remaining members report what substring was matched by parenthesized
388subexpressions within the RE; member
389.Va i
390reports subexpression
391.Va i ,
392with subexpressions counted
393.Pq starting at 1
394by the order of their opening parentheses in the RE, left to right.
395Unused entries in the array
396.Po corresponding either to subexpressions that did not participate in the match
397at all, or to subexpressions that do not exist in the RE
398.Po that is,
399.Va i
400>
401.Fa preg Ns -> Ns Va re_nsub
402.Pc
403.Pc
404have both
405.Va rm_so
406and
407.Va rm_eo
408set to -1.
409If a subexpression participated in the match several times,
410the reported substring is the last one it matched.
411.Po Note, as an example in particular, that when the RE
412.Qq (b*)+
413matches
414.Qq bbb ,
415the parenthesized subexpression matches each of the three
416.So Li b Sc Ns s
417and then an infinite number of empty strings following the last
418.Qq b ,
419so the reported substring is one of the empties.
420.Pc
421.Pp
422If
423.Dv REG_STARTEND
424is specified,
425.Fa pmatch
426must point to at least one
427.Ft regmatch_t
428.Po even if
429.Fa nmatch
430is 0 or
431.Dv REG_NOSUB
432was specified
433.Pc ,
434to hold the input offsets for
435.Dv REG_STARTEND .
436Use for output is still entirely controlled by
437.Fa nmatch ;
438if
439.Fa nmatch
440is 0 or
441.Dv REG_NOSUB
442was specified,
443the value of
444.Fa pmatch Ns [0]
445will not be changed by a successful
446.Fn regexec .
447.Ss Fn regerror
448The
449.Fn regerror
450function maps a non-zero
451.Fa errcode
452from either
453.Fn regcomp
454or
455.Fn regexec
456to a human-readable, printable message.
457If
458.Fa preg
459is non-NULL, the error code should have arisen from use of the
460.Ft regex_t
461pointed to by
462.Fa preg ,
463and if the error code came from
464.Fn regcomp ,
465it should have been the result from the most recent
466.Fn regcomp
467using that
468.Ft regex_t .
469.Po The
471.Fn regerror
472may be able to supply a more detailed message using information
473from the
474.Ft regex_t .
475.Pc
476The
477.Fn regerror
478function places the NUL-terminated message into the buffer pointed to by
479.Fa errbuf ,
480limiting the length
481.Pq including the NUL
482to at most
483.Fa errbuf_size
484bytes.
485If the whole message will not fit, as much of it as will fit before the
486terminating NUL is supplied.
487In any case, the returned value is the size of buffer needed to hold the whole
488message
489.Pq including terminating NUL .
490If
491.Fa errbuf_size
492is 0,
493.Fa errbuf
494is ignored but the return value is still correct.
495.Pp
496If the
497.Fa errcode
498given to
499.Fn regerror
500is first ORed with
501.Dv REG_ITOA ,
502the
503.Qq message
504that results is the printable name of the error code, e.g.,
505.Qq Dv REG_NOMATCH ,
506rather than an explanation thereof.
507If
508.Fa errcode
509is
510.Dv REG_ATOI ,
511then
512.Fa preg
513shall be non-NULL and the
514.Va re_endp
515member of the structure it points to must point to the printable name of an
516error code; in this case, the result in
517.Fa errbuf
518is the decimal digits of the numeric value of the error code
519.Pq 0 if the name is not recognized .
520.Dv REG_ITOA
521and
522.Dv REG_ATOI
523are intended primarily as debugging facilities; they are extensions,
524compatible with but not specified by
525.St -p1003.2 ,
526and should be used with caution in software intended to be portable to other
527systems.
528.Ss Fn regfree
529The
530.Fn regfree
531function frees any dynamically-allocated storage associated with the compiled RE
532pointed to by
533.Fa preg .
534The remaining
535.Ft regex_t
536is no longer a valid compiled RE and the effect of supplying it to
537.Fn regexec
538or
539.Fn regerror
540is undefined.
541.Sh IMPLEMENTATION NOTES
542There are a number of decisions that
543.St -p1003.2
544leaves up to the implementor,
545either by explicitly saying
546.Qq undefined
547or by virtue of them being forbidden by the RE grammar.
548This implementation treats them as follows.
549.Pp
550There is no particular limit on the length of REs, except insofar as memory is
551limited.
552Memory usage is approximately linear in RE size, and largely insensitive
553to RE complexity, except for bounded repetitions.
554.Pp
555A backslashed character other than one specifically given a magic meaning by
556.St -p1003.2
557.Pq such magic meanings occur only in BREs
558is taken as an ordinary character.
559.Pp
560Any unmatched
561.Qq \&[
562is a
563.Dv REG_EBRACK
564error.
565.Pp
566Equivalence classes cannot begin or end bracket-expression ranges.
567The endpoint of one range cannot begin another.
568.Pp
569.Dv RE_DUP_MAX ,
570the limit on repetition counts in bounded repetitions, is 255.
571.Pp
572A repetition operator
573.Po
574.Qq \&? ,
575.Qq \&* ,
576.Qq \&+ ,
577or bounds
578.Pc
579cannot follow another repetition operator.
580A repetition operator cannot begin an expression or subexpression
581or follow
582.Qq \&^
583or
584.Qq \&| .
585.Pp
586.Qq \&|
587cannot appear first or last in a (sub)expression or after another
588.Qq \&| ,
589i.e., an operand of
590.Qq \&|
591cannot be an empty subexpression.
592An empty parenthesized subexpression,
593.Qq () ,
594is legal and matches an empty (sub)string.
595An empty string is not a legal RE.
596.Pp
597A
598.Qq \&{
599followed by a digit is considered the beginning of bounds for a bounded
600repetition, which must then follow the syntax for bounds.
601A
602.Qq \&{
603.Em not
604followed by a digit is considered an ordinary character.
605.Pp
606.Qq \&^
607and
608.Qq \&$
609beginning and ending subexpressions in BREs are anchors, not ordinary
610characters.
611.Sh RETURN VALUES
612On successful completion, the
613.Fn regcomp
614function returns 0.
615Otherwise, it returns an integer value indicating an error as described in
616.In regex.h ,
617and the content of preg is undefined.
618.Pp
619On successful completion, the
620.Fn regexec
621function returns 0.
622Otherwise it returns
623.Dv REG_NOMATCH
624to indicate no match, or
625.Dv REG_ENOSYS
626to indicate that the function is not supported.
627.Pp
628Upon successful completion, the
629.Fn regerror
630function returns the number of bytes needed to hold the entire generated string.
631Otherwise, it returns 0 to indicate that the function is not implemented.
632.Pp
633The
634.Fn regfree
635function returns no value.
636.Pp
637The following constants are defined as error return values:
638.Pp
639.Bl -tag -width "REG_ECOLLATE" -compact
640.It Dv REG_NOMATCH
641The
642.Fn regexec
643function failed to match.
644.It Dv REG_BADPAT
645Invalid regular expression.
646.It Dv REG_ECOLLATE
647Invalid collating element referenced.
648.It Dv REG_ECTYPE
649Invalid character class type referenced.
650.It Dv REG_EESCAPE
651Trailing
652.Qq \&\e
653in pattern.
654.It Dv REG_ESUBREG
655Number in
656.Qq \&\e Ns Em digit
657invalid or in error.
658.It Dv REG_EBRACK
659.Qq []
660imbalance.
661.It Dv REG_ENOSYS
662The function is not supported.
663.It Dv REG_EPAREN
664.Qq \e(\e)
665or
666.Qq ()
667imbalance.
668.It Dv REG_EBRACE
669.Qq \e{\e}
670imbalance.
671.It Dv REG_BADBR
672Content of
673.Qq \e{\e}
674invalid: not a number, number too large, more than two
675numbers, first larger than second.
676.It Dv REG_ERANGE
677Invalid endpoint in range expression.
678.It Dv REG_ESPACE
679Out of memory.
680.It Dv REG_BADRPT
681.Qq \&? ,
682.Qq *
683or
684.Qq +
685not preceded by a valid regular expression.
686.El
687.Sh USAGE
688An application could use:
689.Bd -literal -offset Ds
690regerror(code, preg, (char *)NULL, (size_t)0)
691.Ed
692.Pp
693to find out how big a buffer is needed for the generated string,
694.Fn malloc
695a buffer to hold the string, and then call
696.Fn regerror
697again to get the string
698.Po see
699.Xr malloc 3C
700.Pc .
701Alternatively, it could allocate a fixed, static buffer that is big enough to
702hold most strings, and then use
703.Fn malloc
704to allocate a larger buffer if it finds that this is too small.
705.Sh EXAMPLES
706Matching string against the extended regular expression in pattern.
707.Bd -literal -offset Ds
708#include <regex.h>
709
710/*
711* Match string against the extended regular expression in
712* pattern, treating errors as no match.
713*
714* return 1 for match, 0 for no match
715*/
716int
717match(const char *string, char *pattern)
718{
719	int status;
720	regex_t re;
721
722	if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) {
723		return(0);      /* report error */
724	}
725	status = regexec(&re, string, (size_t) 0, NULL, 0);
726	regfree(&re);
727	if (status != 0) {
728		return(0);      /* report error */
729	}
730	return(1);
731}
732.Ed
733.Pp
734The following demonstrates how the
735.Dv REG_NOTBOL
736flag could be used with
737.Fn regexec
738to find all substrings in a line that match a pattern supplied by a user.
739.Pq For simplicity of the example, very little error checking is done.
740.Bd -literal -offset Ds
741(void) regcomp(&re, pattern, 0);
742/* this call to regexec() finds the first match on the line */
743error = regexec(&re, &buffer[0], 1, &pm, 0);
744while (error == 0) {    /* while matches found */
745	/* substring found between pm.rm_so and pm.rm_eo */
746	/* This call to regexec() finds the next match */
747	error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL);
748}
749.Ed
750.Sh ERRORS
751No errors are defined.
752.Sh CODE SET INDEPENDENCE
753.Sy Enabled
754.Sh INTERFACE STABILITY
755.Sy Standard
756.Sh MT-LEVEL
757.Sy MT-Safe with exceptions
758.Pp
759The
760.Fn regcomp
761function can be used safely in a multithreaded application as long as
762.Xr setlocale 3C
763or
764.Xr uselocale 3C
765is not being called to change the locale.
766.Sh SEE ALSO
767.Xr attributes 7 ,
768.Xr locale 7 ,
769.Xr regex 7 ,
770.Xr standards 7
771.Pp
772.St -p1003.2 ,
773sections 2.8
774.Pq Regular Expression Notation
775and
776B.5
777.Pq C Binding for Regular Expression Matching .
778