xref: /illumos-gate/usr/src/man/man3c/regcomp.3c (revision e8921a52c53ee69f7b65f054d9b2e886139daa59)
1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2.\" Copyright (c) 1992, 1993, 1994
3.\"	The Regents of the University of California.  All rights reserved.
4.\"
5.\" This code is derived from software contributed to Berkeley by
6.\" Henry Spencer.
7.\"
8.\" Redistribution and use in source and binary forms, with or without
9.\" modification, are permitted provided that the following conditions
10.\" are met:
11.\" 1. Redistributions of source code must retain the above copyright
12.\"    notice, this list of conditions and the following disclaimer.
13.\" 2. Redistributions in binary form must reproduce the above copyright
14.\"    notice, this list of conditions and the following disclaimer in the
15.\"    documentation and/or other materials provided with the distribution.
16.\" 3. Neither the name of the University nor the names of its contributors
17.\"    may be used to endorse or promote products derived from this software
18.\"    without specific prior written permission.
19.\"
20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30.\" SUCH DAMAGE.
31.\"
32.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for
33.\" permission to reproduce portions of its copyrighted documentation.
34.\" Original documentation from The Open Group can be obtained online at
35.\" http://www.opengroup.org/bookstore/.
36.\"
37.\" The Institute of Electrical and Electronics Engineers and The Open
38.\" Group, have given us permission to reprint portions of their
39.\" documentation.
40.\"
41.\" In the following statement, the phrase ``this text'' refers to portions
42.\" of the system documentation.
43.\"
44.\" Portions of this text are reprinted and reproduced in electronic form
45.\" in the SunOS Reference Manual, from IEEE Std 1003.1, 2004 Edition,
46.\" Standard for Information Technology -- Portable Operating System
47.\" Interface (POSIX), The Open Group Base Specifications Issue 6,
48.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics
49.\" Engineers, Inc and The Open Group.  In the event of any discrepancy
50.\" between these versions and the original IEEE and The Open Group
51.\" Standard, the original IEEE and The Open Group Standard is the referee
52.\" document.  The original Standard can be obtained online at
53.\" http://www.opengroup.org/unix/online.html.
54.\"
55.\" This notice shall appear on any product containing this material.
56.\"
57.\" The contents of this file are subject to the terms of the
58.\" Common Development and Distribution License (the "License").
59.\" You may not use this file except in compliance with the License.
60.\"
61.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
62.\" or http://www.opensolaris.org/os/licensing.
63.\" See the License for the specific language governing permissions
64.\" and limitations under the License.
65.\"
66.\" When distributing Covered Code, include this CDDL HEADER in each
67.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
68.\" If applicable, add the following below this CDDL HEADER, with the
69.\" fields enclosed by brackets "[]" replaced with your own identifying
70.\" information: Portions Copyright [yyyy] [name of copyright owner]
71.\"
72.\"
73.\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved.
74.\" Portions Copyright (c) 2003, Sun Microsystems, Inc.  All Rights Reserved.
75.\" Copyright 2017 Nexenta Systems, Inc.
76.\"
77.Dd June 14, 2017
78.Dt REGCOMP 3C
79.Os
80.Sh NAME
81.Nm regcomp ,
82.Nm regexec ,
83.Nm regerror ,
84.Nm regfree
85.Nd regular-expression library
86.Sh LIBRARY
87.Lb libc
88.Sh SYNOPSIS
89.In regex.h
90.Ft int
91.Fo regcomp
92.Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags"
93.Fc
94.Ft int
95.Fo regexec
96.Fa "const regex_t *restrict preg" "const char *restrict string"
97.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags"
98.Fc
99.Ft size_t
100.Fo regerror
101.Fa "int errcode" "const regex_t *restrict preg"
102.Fa "char *restrict errbuf" "size_t errbuf_size"
103.Fc
104.Ft void
105.Fn regfree "regex_t *preg"
106.Sh DESCRIPTION
107These routines implement
108.St -p1003.2
109regular expressions; see
110.Xr regex 5 .
111The
112.Fn regcomp
113function compiles an RE written as a string into an internal form,
114.Fn regexec
115matches that internal form against a string and reports results,
116.Fn regerror
117transforms error codes from either into human-readable messages,
118and
119.Fn regfree
120frees any dynamically-allocated storage used by the internal form
121of an RE.
122.Pp
123The header
124.In regex.h
125declares two structure types,
126.Ft regex_t
127and
128.Ft regmatch_t ,
129the former for compiled internal forms and the latter for match reporting.
130It also declares the four functions, a type
131.Ft regoff_t ,
132and a number of constants with names starting with
133.Qq Dv REG_ .
134.Ss Fn regcomp
135The
136.Fn regcomp
137function compiles the regular expression contained in the
138.Fa pattern
139string, subject to the flags in
140.Fa cflags ,
141and places the results in the
142.Ft regex_t
143structure pointed to by
144.Fa preg .
145The
146.Fa cflags
147argument is the bitwise OR of zero or more of the following flags:
148.Bl -tag -width REG_EXTENDED
149.It Dv REG_EXTENDED
150Compile extended regular expressions
151.Pq EREs ,
152rather than the basic regular expressions
153.Pq BREs
154that are the default.
155.It Dv REG_BASIC
156This is a synonym for 0, provided as a counterpart to
157.Dv REG_EXTENDED
158to improve readability.
159.It Dv REG_NOSPEC
160Compile with recognition of all special characters turned off.
161All characters are thus considered ordinary, so the RE is a literal string.
162This is an extension, compatible with but not specified by
163.St -p1003.2 ,
164and should be used with caution in software intended to be portable to other
165systems.
166.Dv REG_EXTENDED
167and
168.Dv REG_NOSPEC
169may not be used in the same call to
170.Fn regcomp .
171.It Dv REG_ICASE
172Compile for matching that ignores upper/lower case distinctions.
173See
174.Xr regex 5 .
175.It Dv REG_NOSUB
176Compile for matching that need only report success or failure,
177not what was matched.
178.It Dv REG_NEWLINE
179Compile for newline-sensitive matching.
180By default, newline is a completely ordinary character with no special
181meaning in either REs or strings.
182With this flag,
183.Qq [^
184bracket expressions and
185.Qq \&.
186never match newline,
187a
188.Qq \&^
189anchor matches the null string after any newline in the string in addition to
190its normal function, and the
191.Qq \&$
192anchor matches the null string before any newline in the string in addition to
193its normal function.
194.It Dv REG_PEND
195The regular expression ends, not at the first NUL, but just before the character
196pointed to by the
197.Va re_endp
198member of the structure pointed to by
199.Fa preg .
200The
201.Va re_endp
202member is of type
203.Vt "const char *" .
204This flag permits inclusion of NULs in the RE; they are considered ordinary
205characters.
206This is an extension, compatible with but not specified by
207.St -p1003.2 ,
208and should be used with caution in software intended to be portable to other
209systems.
210.El
211.Pp
212When successful,
213.Fn regcomp
214returns 0 and fills in the structure pointed to by
215.Fa preg .
216One member of that structure
217.Po other than
218.Va re_endp
219.Pc
220is publicized:
221.Va re_nsub ,
222of type
223.Ft size_t ,
224contains the number of parenthesized subexpressions within the RE
225.Po except that the value of this member is undefined if the
226.Dv REG_NOSUB
227flag was used
228.Pc .
229.Ss Fn regexec
230The
231.Fn regexec
232function matches the compiled RE pointed to by
233.Fa preg
234against the
235.Fa string ,
236subject to the flags in
237.Fa eflags ,
238and reports results using
239.Fa nmatch ,
240.Fa pmatch ,
241and the returned value.
242The RE must have been compiled by a previous invocation of
243.Fn regcomp .
244The compiled form is not altered during execution of
245.Fn regexec ,
246so a single compiled RE can be used simultaneously by multiple threads.
247.Pp
248By default, the NUL-terminated string pointed to by
249.Fa string
250is considered to be the text of an entire line, minus any terminating
251newline.
252The
253.Fa eflags
254argument is the bitwise OR of zero or more of the following flags:
255.Bl -tag -width REG_STARTEND
256.It Dv REG_NOTBOL
257The first character of the string is treated as the continuation
258of a line.
259This means that the anchors
260.Qq \&^ ,
261.Qq [[:<:]] ,
262and
263.Qq \e<
264do not match before it; but see
265.Dv REG_STARTEND
266below.
267This does not affect the behavior of newlines under
268.Dv REG_NEWLINE .
269.It Dv REG_NOTEOL
270The NUL terminating the string does not end a line, so the
271.Qq \&$
272anchor does not match before it.
273This does not affect the behavior of newlines under
274.Dv REG_NEWLINE .
275.It Dv REG_STARTEND
276The string is considered to start at
277.Fa string No +
278.Fa pmatch Ns [0]. Ns Fa rm_so
279and to end before the byte located at
280.Fa string No +
281.Fa pmatch Ns [0]. Ns Fa rm_eo ,
282regardless of the value of
283.Fa nmatch .
284See below for the definition of
285.Fa pmatch
286and
287.Fa nmatch .
288This is an extension, compatible with but not specified by
289.St -p1003.2 ,
290and should be used with caution in software intended to be portable to other
291systems.
292.Pp
293Without
294.Dv REG_NOTBOL ,
295the position
296.Fa rm_so
297is considered the beginning of a line, such that
298.Qq \&^
299matches before it, and the beginning of a word if there is a word character at
300this position, such that
301.Qq [[:<:]]
302and
303.Qq \e<
304match before it.
305.Pp
306With
307.Dv REG_NOTBOL ,
308the character at position
309.Fa rm_so
310is treated as the continuation of a line, and if
311.Fa rm_so
312is greater than 0, the preceding character is taken into consideration.
313If the preceding character is a newline and the regular expression was compiled
314with
315.Dv REG_NEWLINE ,
316.Qq ^
317matches before the string; if the preceding character is not a word character
318but the string starts with a word character,
319.Qq [[:<:]]
320and
321.Qq \e<
322match before the string.
323.El
324.Pp
325See
326.Xr regex 5
327for a discussion of what is matched in situations where an RE or a portion
328thereof could match any of several substrings of
329.Fa string .
330.Pp
331If
332.Dv REG_NOSUB
333was specified in the compilation of the RE, or if
334.Fa nmatch
335is 0,
336.Fn regexec
337ignores the
338.Fa pmatch
339argument
340.Po but see below for the case where
341.Dv REG_STARTEND
342is specified
343.Pc .
344Otherwise,
345.Fa pmatch
346points to an array of
347.Fa nmatch
348structures of type
349.Ft regmatch_t .
350Such a structure has at least the members
351.Va rm_so
352and
353.Va rm_eo ,
354both of type
355.Ft regoff_t
356.Po a signed arithmetic type at least as large as an
357.Ft off_t
358and a
359.Ft ssize_t
360.Pc ,
361containing respectively the offset of the first character of a substring
362and the offset of the first character after the end of the substring.
363Offsets are measured from the beginning of the
364.Fa string
365argument given to
366.Fn regexec .
367An empty substring is denoted by equal offsets, both indicating the character
368following the empty substring.
369.Pp
370The 0th member of the
371.Fa pmatch
372array is filled in to indicate what substring of
373.Fa string
374was matched by the entire RE.
375Remaining members report what substring was matched by parenthesized
376subexpressions within the RE; member
377.Va i
378reports subexpression
379.Va i ,
380with subexpressions counted
381.Pq starting at 1
382by the order of their opening parentheses in the RE, left to right.
383Unused entries in the array
384.Po corresponding either to subexpressions that did not participate in the match
385at all, or to subexpressions that do not exist in the RE
386.Po that is,
387.Va i
388>
389.Fa preg Ns -> Ns Va re_nsub
390.Pc
391.Pc
392have both
393.Va rm_so
394and
395.Va rm_eo
396set to -1.
397If a subexpression participated in the match several times,
398the reported substring is the last one it matched.
399.Po Note, as an example in particular, that when the RE
400.Qq (b*)+
401matches
402.Qq bbb ,
403the parenthesized subexpression matches each of the three
404.So Li b Sc Ns s
405and then an infinite number of empty strings following the last
406.Qq b ,
407so the reported substring is one of the empties.
408.Pc
409.Pp
410If
411.Dv REG_STARTEND
412is specified,
413.Fa pmatch
414must point to at least one
415.Ft regmatch_t
416.Po even if
417.Fa nmatch
418is 0 or
419.Dv REG_NOSUB
420was specified
421.Pc ,
422to hold the input offsets for
423.Dv REG_STARTEND .
424Use for output is still entirely controlled by
425.Fa nmatch ;
426if
427.Fa nmatch
428is 0 or
429.Dv REG_NOSUB
430was specified,
431the value of
432.Fa pmatch Ns [0]
433will not be changed by a successful
434.Fn regexec .
435.Ss Fn regerror
436The
437.Fn regerror
438function maps a non-zero
439.Fa errcode
440from either
441.Fn regcomp
442or
443.Fn regexec
444to a human-readable, printable message.
445If
446.Fa preg
447is non-NULL, the error code should have arisen from use of the
448.Ft regex_t
449pointed to by
450.Fa preg ,
451and if the error code came from
452.Fn regcomp ,
453it should have been the result from the most recent
454.Fn regcomp
455using that
456.Ft regex_t .
457.Po
458The
459.Fn regerror
460function may be able to supply a more detailed message using information
461from the
462.Ft regex_t .
463.Pc
464The
465.Fn regerror
466function places the NUL-terminated message into the buffer pointed to by
467.Fa errbuf ,
468limiting the length
469.Pq including the NUL
470to at most
471.Fa errbuf_size
472bytes.
473If the whole message will not fit, as much of it as will fit before the
474terminating NUL is supplied.
475In any case, the returned value is the size of buffer needed to hold the whole
476message
477.Pq including terminating NUL .
478If
479.Fa errbuf_size
480is 0,
481.Fa errbuf
482is ignored but the return value is still correct.
483.Pp
484If the
485.Fa errcode
486given to
487.Fn regerror
488is first ORed with
489.Dv REG_ITOA ,
490the
491.Qq message
492that results is the printable name of the error code, e.g.
493.Qq Dv REG_NOMATCH ,
494rather than an explanation thereof.
495If
496.Fa errcode
497is
498.Dv REG_ATOI ,
499then
500.Fa preg
501shall be non-NULL and the
502.Va re_endp
503member of the structure it points to must point to the printable name of an
504error code; in this case, the result in
505.Fa errbuf
506is the decimal digits of the numeric value of the error code
507.Pq 0 if the name is not recognized .
508.Dv REG_ITOA
509and
510.Dv REG_ATOI
511are intended primarily as debugging facilities; they are extensions,
512compatible with but not specified by
513.St -p1003.2 ,
514and should be used with caution in software intended to be portable to other
515systems.
516.Ss Fn regfree
517The
518.Fn regfree
519function frees any dynamically-allocated storage associated with the compiled RE
520pointed to by
521.Fa preg .
522The remaining
523.Ft regex_t
524is no longer a valid compiled RE and the effect of supplying it to
525.Fn regexec
526or
527.Fn regerror
528is undefined.
529.Sh IMPLEMENTATION NOTES
530There are a number of decisions that
531.St -p1003.2
532leaves up to the implementor,
533either by explicitly saying
534.Qq undefined
535or by virtue of them being forbidden by the RE grammar.
536This implementation treats them as follows.
537.Pp
538There is no particular limit on the length of REs, except insofar as memory is
539limited.
540Memory usage is approximately linear in RE size, and largely insensitive
541to RE complexity, except for bounded repetitions.
542.Pp
543A backslashed character other than one specifically given a magic meaning by
544.St -p1003.2
545.Pq such magic meanings occur only in BREs
546is taken as an ordinary character.
547.Pp
548Any unmatched
549.Qq \&[
550is a
551.Dv REG_EBRACK
552error.
553.Pp
554Equivalence classes cannot begin or end bracket-expression ranges.
555The endpoint of one range cannot begin another.
556.Pp
557.Dv RE_DUP_MAX ,
558the limit on repetition counts in bounded repetitions, is 255.
559.Pp
560A repetition operator
561.Po
562.Qq \&? ,
563.Qq \&* ,
564.Qq \&+ ,
565or bounds
566.Pc
567cannot follow another repetition operator.
568A repetition operator cannot begin an expression or subexpression
569or follow
570.Qq \&^
571or
572.Qq \&| .
573.Pp
574.Qq \&|
575cannot appear first or last in a (sub)expression or after another
576.Qq \&| ,
577i.e., an operand of
578.Qq \&|
579cannot be an empty subexpression.
580An empty parenthesized subexpression,
581.Qq () ,
582is legal and matches an empty (sub)string.
583An empty string is not a legal RE.
584.Pp
585A
586.Qq \&{
587followed by a digit is considered the beginning of bounds for a bounded
588repetition, which must then follow the syntax for bounds.
589A
590.Qq \&{
591.Em not
592followed by a digit is considered an ordinary character.
593.Pp
594.Qq \&^
595and
596.Qq \&$
597beginning and ending subexpressions in BREs are anchors, not ordinary
598characters.
599.Sh RETURN VALUES
600On successful completion, the
601.Fn regcomp
602function returns 0.
603Otherwise, it returns an integer value indicating an error as described in
604.In regex.h ,
605and the content of preg is undefined.
606.Pp
607On successful completion, the
608.Fn regexec
609function returns 0.
610Otherwise it returns
611.Dv REG_NOMATCH
612to indicate no match, or
613.Dv REG_ENOSYS
614to indicate that the function is not supported.
615.Pp
616Upon successful completion, the
617.Fn regerror
618function returns the number of bytes needed to hold the entire generated string.
619Otherwise, it returns 0 to indicate that the function is not implemented.
620.Pp
621The
622.Fn regfree
623function returns no value.
624.Pp
625The following constants are defined as error return values:
626.Pp
627.Bl -tag -width "REG_ECOLLATE" -compact
628.It Dv REG_NOMATCH
629The
630.Fn regexec
631function failed to match.
632.It Dv REG_BADPAT
633Invalid regular expression.
634.It Dv REG_ECOLLATE
635Invalid collating element referenced.
636.It Dv REG_ECTYPE
637Invalid character class type referenced.
638.It Dv REG_EESCAPE
639Trailing
640.Qq \&\e
641in pattern.
642.It Dv REG_ESUBREG
643Number in
644.Qq \&\e Ns Em digit
645invalid or in error.
646.It Dv REG_EBRACK
647.Qq []
648imbalance.
649.It Dv REG_ENOSYS
650The function is not supported.
651.It Dv REG_EPAREN
652.Qq \e(\e)
653or
654.Qq ()
655imbalance.
656.It Dv REG_EBRACE
657.Qq \e{\e}
658imbalance.
659.It Dv REG_BADBR
660Content of
661.Qq \e{\e}
662invalid: not a number, number too large, more than two
663numbers, first larger than second.
664.It Dv REG_ERANGE
665Invalid endpoint in range expression.
666.It Dv REG_ESPACE
667Out of memory.
668.It Dv REG_BADRPT
669.Qq \&? ,
670.Qq *
671or
672.Qq +
673not preceded by valid regular expression.
674.El
675.Sh USAGE
676An application could use:
677.Bd -literal -offset Ds
678regerror(code, preg, (char *)NULL, (size_t)0)
679.Ed
680.Pp
681to find out how big a buffer is needed for the generated string,
682.Fn malloc
683a buffer to hold the string, and then call
684.Fn regerror
685again to get the string
686.Po see
687.Xr malloc 3C
688.Pc .
689Alternately, it could allocate a fixed, static buffer that is big enough to hold
690most strings, and then use
691.Fn malloc
692to allocate a larger buffer if it finds that this is too small.
693.Sh EXAMPLES
694Matching string against the extended regular expression in pattern.
695.Bd -literal -offset Ds
696#include <regex.h>
697
698/*
699* Match string against the extended regular expression in
700* pattern, treating errors as no match.
701*
702* return 1 for match, 0 for no match
703*/
704int
705match(const char *string, char *pattern)
706{
707	int status;
708	regex_t re;
709
710	if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) {
711		return(0);      /* report error */
712	}
713	status = regexec(&re, string, (size_t) 0, NULL, 0);
714	regfree(&re);
715	if (status != 0) {
716		return(0);      /* report error */
717	}
718	return(1);
719}
720.Ed
721.Pp
722The following demonstrates how the
723.Dv REG_NOTBOL
724flag could be used with
725.Fn regexec
726to find all substrings in a line that match a pattern supplied by a user.
727.Pq For simplicity of the example, very little error checking is done.
728.Bd -literal -offset Ds
729(void) regcomp(&re, pattern, 0);
730/* this call to regexec() finds the first match on the line */
731error = regexec(&re, &buffer[0], 1, &pm, 0);
732while (error == 0) {    /* while matches found */
733	/* substring found between pm.rm_so and pm.rm_eo */
734	/* This call to regexec() finds the next match */
735	error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL);
736}
737.Ed
738.Sh ERRORS
739No errors are defined.
740.Sh CODE SET INDEPENDENCE
741.Sy Enabled
742.Sh INTERFACE STABILITY
743.Sy Standard
744.Sh MT-LEVEL
745.Sy MT-Safe with exceptions
746.Pp
747The
748.Fn regcomp
749function can be used safely in a multithreaded application as long as
750.Xr setlocale 3C
751is not being called to change the locale.
752.Sh SEE ALSO
753.Xr attributes 5 ,
754.Xr regex 5 ,
755.Xr standards 5
756.Pp
757.St -p1003.2 ,
758sections 2.8
759.Pq Regular Expression Notation
760and
761B.5
762.Pq C Binding for Regular Expression Matching .
763