xref: /illumos-gate/usr/src/man/man3c/regcomp.3c (revision e07d85f87c3920e032adb855fdc500e4616c7718)
1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
2.\" Copyright (c) 1992, 1993, 1994
3.\"	The Regents of the University of California.  All rights reserved.
4.\"
5.\" This code is derived from software contributed to Berkeley by
6.\" Henry Spencer.
7.\"
8.\" Redistribution and use in source and binary forms, with or without
9.\" modification, are permitted provided that the following conditions
10.\" are met:
11.\" 1. Redistributions of source code must retain the above copyright
12.\"    notice, this list of conditions and the following disclaimer.
13.\" 2. Redistributions in binary form must reproduce the above copyright
14.\"    notice, this list of conditions and the following disclaimer in the
15.\"    documentation and/or other materials provided with the distribution.
16.\" 3. Neither the name of the University nor the names of its contributors
17.\"    may be used to endorse or promote products derived from this software
18.\"    without specific prior written permission.
19.\"
20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
30.\" SUCH DAMAGE.
31.\"
32.\"
33.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for permission
34.\" to reproduce portions of its copyrighted documentation.
35.\"
36.\" Original documentation from The Open Group can be obtained online at
37.\" http://www.opengroup.org/bookstore/.
38.\"
39.\" The Institute of Electrical and Electronics Engineers and The Open Group,
40.\" have given us permission to reprint portions of their documentation. In the
41.\" following statement, the phrase "this text" refers to portions of the system
42.\" documentation.
43.\"
44.\" Portions of this text are reprinted and reproduced in electronic form in the
45.\" Sun OS Reference Manual, from IEEE Std 1003.1, 2004 Edition, Standard for
46.\" Information Technology -- Portable Operating System Interface (POSIX),
47.\" The Open Group Base Specifications Issue 6, Copyright (C) 2001-2004 by the
48.\" Institute of Electrical and Electronics Engineers, Inc and The Open Group.
49.\"
50.\" In the event of any discrepancy between these versions and the original
51.\" IEEE and The Open Group Standard, the original IEEE and The Open Group
52.\" Standard is the referee document.
53.\"
54.\" The original Standard can be obtained online at
55.\" http://www.opengroup.org/unix/online.html.
56.\"
57.\" This notice shall appear on any product containing this material.
58.\"
59.\" The contents of this file are subject to the terms of the
60.\" Common Development and Distribution License (the "License").
61.\" You may not use this file except in compliance with the License.
62.\"
63.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
64.\" or http://www.opensolaris.org/os/licensing.
65.\" See the License for the specific language governing permissions
66.\" and limitations under the License.
67.\"
68.\" When distributing Covered Code, include this CDDL HEADER in each
69.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
70.\" If applicable, add the following below this CDDL HEADER, with the
71.\" fields enclosed by brackets "[]" replaced with your own identifying
72.\" information: Portions Copyright [yyyy] [name of copyright owner]
73.\"
74.\"
75.\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved.
76.\" Portions Copyright (c) 2003, Sun Microsystems, Inc.  All Rights Reserved.
77.\" Copyright 2017 Nexenta Systems, Inc.
78.\"
79.Dd June 14, 2017
80.Dt REGCOMP 3C
81.Os
82.Sh NAME
83.Nm regcomp ,
84.Nm regexec ,
85.Nm regerror ,
86.Nm regfree
87.Nd regular-expression library
88.Sh LIBRARY
89.Lb libc
90.Sh SYNOPSIS
91.In regex.h
92.Ft int
93.Fo regcomp
94.Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags"
95.Fc
96.Ft int
97.Fo regexec
98.Fa "const regex_t *restrict preg" "const char *restrict string"
99.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags"
100.Fc
101.Ft size_t
102.Fo regerror
103.Fa "int errcode" "const regex_t *restrict preg"
104.Fa "char *restrict errbuf" "size_t errbuf_size"
105.Fc
106.Ft void
107.Fn regfree "regex_t *preg"
108.Sh DESCRIPTION
109These routines implement
110.St -p1003.2
111regular expressions; see
112.Xr regex 5 .
113The
114.Fn regcomp
115function compiles an RE written as a string into an internal form,
116.Fn regexec
117matches that internal form against a string and reports results,
118.Fn regerror
119transforms error codes from either into human-readable messages,
120and
121.Fn regfree
122frees any dynamically-allocated storage used by the internal form
123of an RE.
124.Pp
125The header
126.In regex.h
127declares two structure types,
128.Ft regex_t
129and
130.Ft regmatch_t ,
131the former for compiled internal forms and the latter for match reporting.
132It also declares the four functions, a type
133.Ft regoff_t ,
134and a number of constants with names starting with
135.Qq Dv REG_ .
136.Ss Fn regcomp
137The
138.Fn regcomp
139function compiles the regular expression contained in the
140.Fa pattern
141string, subject to the flags in
142.Fa cflags ,
143and places the results in the
144.Ft regex_t
145structure pointed to by
146.Fa preg .
147The
148.Fa cflags
149argument is the bitwise OR of zero or more of the following flags:
150.Bl -tag -width REG_EXTENDED
151.It Dv REG_EXTENDED
152Compile extended regular expressions
153.Pq EREs ,
154rather than the basic regular expressions
155.Pq BREs
156that are the default.
157.It Dv REG_BASIC
158This is a synonym for 0, provided as a counterpart to
159.Dv REG_EXTENDED
160to improve readability.
161.It Dv REG_NOSPEC
162Compile with recognition of all special characters turned off.
163All characters are thus considered ordinary, so the RE is a literal string.
164This is an extension, compatible with but not specified by
165.St -p1003.2 ,
166and should be used with caution in software intended to be portable to other
167systems.
168.Dv REG_EXTENDED
169and
170.Dv REG_NOSPEC
171may not be used in the same call to
172.Fn regcomp .
173.It Dv REG_ICASE
174Compile for matching that ignores upper/lower case distinctions.
175See
176.Xr regex 5 .
177.It Dv REG_NOSUB
178Compile for matching that need only report success or failure,
179not what was matched.
180.It Dv REG_NEWLINE
181Compile for newline-sensitive matching.
182By default, newline is a completely ordinary character with no special
183meaning in either REs or strings.
184With this flag,
185.Qq [^
186bracket expressions and
187.Qq \&.
188never match newline,
189a
190.Qq \&^
191anchor matches the null string after any newline in the string in addition to
192its normal function, and the
193.Qq \&$
194anchor matches the null string before any newline in the string in addition to
195its normal function.
196.It Dv REG_PEND
197The regular expression ends, not at the first NUL, but just before the character
198pointed to by the
199.Va re_endp
200member of the structure pointed to by
201.Fa preg .
202The
203.Va re_endp
204member is of type
205.Vt "const char *" .
206This flag permits inclusion of NULs in the RE; they are considered ordinary
207characters.
208This is an extension, compatible with but not specified by
209.St -p1003.2 ,
210and should be used with caution in software intended to be portable to other
211systems.
212.El
213.Pp
214When successful,
215.Fn regcomp
216returns 0 and fills in the structure pointed to by
217.Fa preg .
218One member of that structure
219.Po other than
220.Va re_endp
221.Pc
222is publicized:
223.Va re_nsub ,
224of type
225.Ft size_t ,
226contains the number of parenthesized subexpressions within the RE
227.Po except that the value of this member is undefined if the
228.Dv REG_NOSUB
229flag was used
230.Pc .
231.Ss Fn regexec
232The
233.Fn regexec
234function matches the compiled RE pointed to by
235.Fa preg
236against the
237.Fa string ,
238subject to the flags in
239.Fa eflags ,
240and reports results using
241.Fa nmatch ,
242.Fa pmatch ,
243and the returned value.
244The RE must have been compiled by a previous invocation of
245.Fn regcomp .
246The compiled form is not altered during execution of
247.Fn regexec ,
248so a single compiled RE can be used simultaneously by multiple threads.
249.Pp
250By default, the NUL-terminated string pointed to by
251.Fa string
252is considered to be the text of an entire line, minus any terminating
253newline.
254The
255.Fa eflags
256argument is the bitwise OR of zero or more of the following flags:
257.Bl -tag -width REG_STARTEND
258.It Dv REG_NOTBOL
259The first character of the string is treated as the continuation
260of a line.
261This means that the anchors
262.Qq \&^ ,
263.Qq [[:<:]] ,
264and
265.Qq \e<
266do not match before it; but see
267.Dv REG_STARTEND
268below.
269This does not affect the behavior of newlines under
270.Dv REG_NEWLINE .
271.It Dv REG_NOTEOL
272The NUL terminating the string does not end a line, so the
273.Qq \&$
274anchor does not match before it.
275This does not affect the behavior of newlines under
276.Dv REG_NEWLINE .
277.It Dv REG_STARTEND
278The string is considered to start at
279.Fa string No +
280.Fa pmatch Ns [0]. Ns Fa rm_so
281and to end before the byte located at
282.Fa string No +
283.Fa pmatch Ns [0]. Ns Fa rm_eo ,
284regardless of the value of
285.Fa nmatch .
286See below for the definition of
287.Fa pmatch
288and
289.Fa nmatch .
290This is an extension, compatible with but not specified by
291.St -p1003.2 ,
292and should be used with caution in software intended to be portable to other
293systems.
294.Pp
295Without
296.Dv REG_NOTBOL ,
297the position
298.Fa rm_so
299is considered the beginning of a line, such that
300.Qq \&^
301matches before it, and the beginning of a word if there is a word character at
302this position, such that
303.Qq [[:<:]]
304and
305.Qq \e<
306match before it.
307.Pp
308With
309.Dv REG_NOTBOL ,
310the character at position
311.Fa rm_so
312is treated as the continuation of a line, and if
313.Fa rm_so
314is greater than 0, the preceding character is taken into consideration.
315If the preceding character is a newline and the regular expression was compiled
316with
317.Dv REG_NEWLINE ,
318.Qq ^
319matches before the string; if the preceding character is not a word character
320but the string starts with a word character,
321.Qq [[:<:]]
322and
323.Qq \e<
324match before the string.
325.El
326.Pp
327See
328.Xr regex 5
329for a discussion of what is matched in situations where an RE or a portion
330thereof could match any of several substrings of
331.Fa string .
332.Pp
333If
334.Dv REG_NOSUB
335was specified in the compilation of the RE, or if
336.Fa nmatch
337is 0,
338.Fn regexec
339ignores the
340.Fa pmatch
341argument
342.Po but see below for the case where
343.Dv REG_STARTEND
344is specified
345.Pc .
346Otherwise,
347.Fa pmatch
348points to an array of
349.Fa nmatch
350structures of type
351.Ft regmatch_t .
352Such a structure has at least the members
353.Va rm_so
354and
355.Va rm_eo ,
356both of type
357.Ft regoff_t
358.Po a signed arithmetic type at least as large as an
359.Ft off_t
360and a
361.Ft ssize_t
362.Pc ,
363containing respectively the offset of the first character of a substring
364and the offset of the first character after the end of the substring.
365Offsets are measured from the beginning of the
366.Fa string
367argument given to
368.Fn regexec .
369An empty substring is denoted by equal offsets, both indicating the character
370following the empty substring.
371.Pp
372The 0th member of the
373.Fa pmatch
374array is filled in to indicate what substring of
375.Fa string
376was matched by the entire RE.
377Remaining members report what substring was matched by parenthesized
378subexpressions within the RE; member
379.Va i
380reports subexpression
381.Va i ,
382with subexpressions counted
383.Pq starting at 1
384by the order of their opening parentheses in the RE, left to right.
385Unused entries in the array
386.Po corresponding either to subexpressions that did not participate in the match
387at all, or to subexpressions that do not exist in the RE
388.Po that is,
389.Va i
390>
391.Fa preg Ns -> Ns Va re_nsub
392.Pc
393.Pc
394have both
395.Va rm_so
396and
397.Va rm_eo
398set to -1.
399If a subexpression participated in the match several times,
400the reported substring is the last one it matched.
401.Po Note, as an example in particular, that when the RE
402.Qq (b*)+
403matches
404.Qq bbb ,
405the parenthesized subexpression matches each of the three
406.So Li b Sc Ns s
407and then an infinite number of empty strings following the last
408.Qq b ,
409so the reported substring is one of the empties.
410.Pc
411.Pp
412If
413.Dv REG_STARTEND
414is specified,
415.Fa pmatch
416must point to at least one
417.Ft regmatch_t
418.Po even if
419.Fa nmatch
420is 0 or
421.Dv REG_NOSUB
422was specified
423.Pc ,
424to hold the input offsets for
425.Dv REG_STARTEND .
426Use for output is still entirely controlled by
427.Fa nmatch ;
428if
429.Fa nmatch
430is 0 or
431.Dv REG_NOSUB
432was specified,
433the value of
434.Fa pmatch Ns [0]
435will not be changed by a successful
436.Fn regexec .
437.Ss Fn regerror
438The
439.Fn regerror
440function maps a non-zero
441.Fa errcode
442from either
443.Fn regcomp
444or
445.Fn regexec
446to a human-readable, printable message.
447If
448.Fa preg
449is non-NULL, the error code should have arisen from use of the
450.Ft regex_t
451pointed to by
452.Fa preg ,
453and if the error code came from
454.Fn regcomp ,
455it should have been the result from the most recent
456.Fn regcomp
457using that
458.Ft regex_t .
459.Po
460The
461.Fn regerror
462may be able to supply a more detailed message using information
463from the
464.Ft regex_t .
465.Pc
466The
467.Fn regerror
468function places the NUL-terminated message into the buffer pointed to by
469.Fa errbuf ,
470limiting the length
471.Pq including the NUL
472to at most
473.Fa errbuf_size
474bytes.
475If the whole message will not fit, as much of it as will fit before the
476terminating NUL is supplied.
477In any case, the returned value is the size of buffer needed to hold the whole
478message
479.Pq including terminating NUL .
480If
481.Fa errbuf_size
482is 0,
483.Fa errbuf
484is ignored but the return value is still correct.
485.Pp
486If the
487.Fa errcode
488given to
489.Fn regerror
490is first ORed with
491.Dv REG_ITOA ,
492the
493.Qq message
494that results is the printable name of the error code, e.g.\&
495.Qq Dv REG_NOMATCH ,
496rather than an explanation thereof.
497If
498.Fa errcode
499is
500.Dv REG_ATOI ,
501then
502.Fa preg
503shall be non-NULL and the
504.Va re_endp
505member of the structure it points to must point to the printable name of an
506error code; in this case, the result in
507.Fa errbuf
508is the decimal digits of the numeric value of the error code
509.Pq 0 if the name is not recognized .
510.Dv REG_ITOA
511and
512.Dv REG_ATOI
513are intended primarily as debugging facilities; they are extensions,
514compatible with but not specified by
515.St -p1003.2 ,
516and should be used with caution in software intended to be portable to other
517systems.
518.Ss Fn regfree
519The
520.Fn regfree
521function frees any dynamically-allocated storage associated with the compiled RE
522pointed to by
523.Fa preg .
524The remaining
525.Ft regex_t
526is no longer a valid compiled RE and the effect of supplying it to
527.Fn regexec
528or
529.Fn regerror
530is undefined.
531.Sh IMPLEMENTATION NOTES
532There are a number of decisions that
533.St -p1003.2
534leaves up to the implementor,
535either by explicitly saying
536.Qq undefined
537or by virtue of them being forbidden by the RE grammar.
538This implementation treats them as follows.
539.Pp
540There is no particular limit on the length of REs, except insofar as memory is
541limited.
542Memory usage is approximately linear in RE size, and largely insensitive
543to RE complexity, except for bounded repetitions.
544.Pp
545A backslashed character other than one specifically given a magic meaning by
546.St -p1003.2
547.Pq such magic meanings occur only in BREs
548is taken as an ordinary character.
549.Pp
550Any unmatched
551.Qq \&[
552is a
553.Dv REG_EBRACK
554error.
555.Pp
556Equivalence classes cannot begin or end bracket-expression ranges.
557The endpoint of one range cannot begin another.
558.Pp
559.Dv RE_DUP_MAX ,
560the limit on repetition counts in bounded repetitions, is 255.
561.Pp
562A repetition operator
563.Po
564.Qq \&? ,
565.Qq \&* ,
566.Qq \&+ ,
567or bounds
568.Pc
569cannot follow another repetition operator.
570A repetition operator cannot begin an expression or subexpression
571or follow
572.Qq \&^
573or
574.Qq \&| .
575.Pp
576.Qq \&|
577cannot appear first or last in a (sub)expression or after another
578.Qq \&| ,
579i.e., an operand of
580.Qq \&|
581cannot be an empty subexpression.
582An empty parenthesized subexpression,
583.Qq () ,
584is legal and matches an empty (sub)string.
585An empty string is not a legal RE.
586.Pp
587A
588.Qq \&{
589followed by a digit is considered the beginning of bounds for a bounded
590repetition, which must then follow the syntax for bounds.
591A
592.Qq \&{
593.Em not
594followed by a digit is considered an ordinary character.
595.Pp
596.Qq \&^
597and
598.Qq \&$
599beginning and ending subexpressions in BREs are anchors, not ordinary
600characters.
601.Sh RETURN VALUES
602On successful completion, the
603.Fn regcomp
604function returns 0.
605Otherwise, it returns an integer value indicating an error as described in
606.In regex.h ,
607and the content of preg is undefined.
608.Pp
609On successful completion, the
610.Fn regexec
611function returns 0.
612Otherwise it returns
613.Dv REG_NOMATCH
614to indicate no match, or
615.Dv REG_ENOSYS
616to indicate that the function is not supported.
617.Pp
618Upon successful completion, the
619.Fn regerror
620function returns the number of bytes needed to hold the entire generated string.
621Otherwise, it returns 0 to indicate that the function is not implemented.
622.Pp
623The
624.Fn regfree
625function returns no value.
626.Pp
627The following constants are defined as error return values:
628.Pp
629.Bl -tag -width "REG_ECOLLATE" -compact
630.It Dv REG_NOMATCH
631The
632.Fn regexec
633function failed to match.
634.It Dv REG_BADPAT
635Invalid regular expression.
636.It Dv REG_ECOLLATE
637Invalid collating element referenced.
638.It Dv REG_ECTYPE
639Invalid character class type referenced.
640.It Dv REG_EESCAPE
641Trailing
642.Qq \&\e
643in pattern.
644.It Dv REG_ESUBREG
645Number in
646.Qq \&\e Ns Em digit
647invalid or in error.
648.It Dv REG_EBRACK
649.Qq []
650imbalance.
651.It Dv REG_ENOSYS
652The function is not supported.
653.It Dv REG_EPAREN
654.Qq \e(\e)
655or
656.Qq ()
657imbalance.
658.It Dv REG_EBRACE
659.Qq \e{\e}
660imbalance.
661.It Dv REG_BADBR
662Content of
663.Qq \e{\e}
664invalid: not a number, number too large, more than two
665numbers, first larger than second.
666.It Dv REG_ERANGE
667Invalid endpoint in range expression.
668.It Dv REG_ESPACE
669Out of memory.
670.It Dv REG_BADRPT
671.Qq \&? ,
672.Qq *
673or
674.Qq +
675not preceded by valid regular expression.
676.El
677.Sh USAGE
678An application could use:
679.Bd -literal -offset Ds
680regerror(code, preg, (char *)NULL, (size_t)0)
681.Ed
682.Pp
683to find out how big a buffer is needed for the generated string,
684.Fn malloc
685a buffer to hold the string, and then call
686.Fn regerror
687again to get the string
688.Po see
689.Xr malloc 3C
690.Pc .
691Alternatively, it could allocate a fixed, static buffer that is big enough to hold
692most strings, and then use
693.Fn malloc
694to allocate a larger buffer if it finds that this is too small.
695.Sh EXAMPLES
696Matching string against the extended regular expression in pattern.
697.Bd -literal -offset Ds
698#include <regex.h>
699
700/*
701* Match string against the extended regular expression in
702* pattern, treating errors as no match.
703*
704* return 1 for match, 0 for no match
705*/
706int
707match(const char *string, char *pattern)
708{
709	int status;
710	regex_t re;
711
712	if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) {
713		return(0);      /* report error */
714	}
715	status = regexec(&re, string, (size_t) 0, NULL, 0);
716	regfree(&re);
717	if (status != 0) {
718		return(0);      /* report error */
719	}
720	return(1);
721}
722.Ed
723.Pp
724The following demonstrates how the
725.Dv REG_NOTBOL
726flag could be used with
727.Fn regexec
728to find all substrings in a line that match a pattern supplied by a user.
729.Pq For simplicity of the example, very little error checking is done.
730.Bd -literal -offset Ds
731(void) regcomp(&re, pattern, 0);
732/* this call to regexec() finds the first match on the line */
733error = regexec(&re, &buffer[0], 1, &pm, 0);
734while (error == 0) {    /* while matches found */
735	/* substring found between pm.rm_so and pm.rm_eo */
736	/* This call to regexec() finds the next match */
737	error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL);
738}
739.Ed
740.Sh ERRORS
741No errors are defined.
742.Sh CODE SET INDEPENDENCE
743.Sy Enabled
744.Sh INTERFACE STABILITY
745.Sy Standard
746.Sh MT-LEVEL
747.Sy MT-Safe with exceptions
748.Pp
749The
750.Fn regcomp
751function can be used safely in a multithreaded application as long as
752.Xr setlocale 3C
753is not being called to change the locale.
754.Sh SEE ALSO
755.Xr attributes 5 ,
756.Xr regex 5 ,
757.Xr standards 5
758.Pp
759.St -p1003.2 ,
760sections 2.8
761.Pq Regular Expression Notation
762and
763B.5
764.Pq C Binding for Regular Expression Matching .
765