1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. 2.\" Copyright (c) 1992, 1993, 1994 3.\" The Regents of the University of California. All rights reserved. 4.\" 5.\" This code is derived from software contributed to Berkeley by 6.\" Henry Spencer. 7.\" 8.\" Redistribution and use in source and binary forms, with or without 9.\" modification, are permitted provided that the following conditions 10.\" are met: 11.\" 1. Redistributions of source code must retain the above copyright 12.\" notice, this list of conditions and the following disclaimer. 13.\" 2. Redistributions in binary form must reproduce the above copyright 14.\" notice, this list of conditions and the following disclaimer in the 15.\" documentation and/or other materials provided with the distribution. 16.\" 3. Neither the name of the University nor the names of its contributors 17.\" may be used to endorse or promote products derived from this software 18.\" without specific prior written permission. 19.\" 20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30.\" SUCH DAMAGE. 31.\" 32.\" 33.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for permission 34.\" to reproduce portions of its copyrighted documentation. 
35.\" 36.\" Original documentation from The Open Group can be obtained online at 37.\" http://www.opengroup.org/bookstore/. 38.\" 39.\" The Institute of Electrical and Electronics Engineers and The Open Group, 40.\" have given us permission to reprint portions of their documentation. In the 41.\" following statement, the phrase "this text" refers to portions of the system 42.\" documentation. 43.\" 44.\" Portions of this text are reprinted and reproduced in electronic form in the 45.\" Sun OS Reference Manual, from IEEE Std 1003.1, 2004 Edition, Standard for 46.\" Information Technology -- Portable Operating System Interface (POSIX), 47.\" The Open Group Base Specifications Issue 6, Copyright (C) 2001-2004 by the 48.\" Institute of Electrical and Electronics Engineers, Inc and The Open Group. 49.\" 50.\" In the event of any discrepancy between these versions and the original 51.\" IEEE and The Open Group Standard, the original IEEE and The Open Group 52.\" Standard is the referee document. 53.\" 54.\" The original Standard can be obtained online at 55.\" http://www.opengroup.org/unix/online.html. 56.\" 57.\" This notice shall appear on any product containing this material. 58.\" 59.\" The contents of this file are subject to the terms of the 60.\" Common Development and Distribution License (the "License"). 61.\" You may not use this file except in compliance with the License. 62.\" 63.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 64.\" or http://www.opensolaris.org/os/licensing. 65.\" See the License for the specific language governing permissions 66.\" and limitations under the License. 67.\" 68.\" When distributing Covered Code, include this CDDL HEADER in each 69.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
70.\" If applicable, add the following below this CDDL HEADER, with the 71.\" fields enclosed by brackets "[]" replaced with your own identifying 72.\" information: Portions Copyright [yyyy] [name of copyright owner] 73.\" 74.\" 75.\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved. 76.\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved. 77.\" Copyright 2017 Nexenta Systems, Inc. 78.\" 79.Dd June 14, 2017 80.Dt REGCOMP 3C 81.Os 82.Sh NAME 83.Nm regcomp , 84.Nm regexec , 85.Nm regerror , 86.Nm regfree 87.Nd regular-expression library 88.Sh LIBRARY 89.Lb libc 90.Sh SYNOPSIS 91.In regex.h 92.Ft int 93.Fo regcomp 94.Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags" 95.Fc 96.Ft int 97.Fo regexec 98.Fa "const regex_t *restrict preg" "const char *restrict string" 99.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags" 100.Fc 101.Ft size_t 102.Fo regerror 103.Fa "int errcode" "const regex_t *restrict preg" 104.Fa "char *restrict errbuf" "size_t errbuf_size" 105.Fc 106.Ft void 107.Fn regfree "regex_t *preg" 108.Sh DESCRIPTION 109These routines implement 110.St -p1003.2 111regular expressions; see 112.Xr regex 5 . 113The 114.Fn regcomp 115function compiles an RE written as a string into an internal form, 116.Fn regexec 117matches that internal form against a string and reports results, 118.Fn regerror 119transforms error codes from either into human-readable messages, 120and 121.Fn regfree 122frees any dynamically-allocated storage used by the internal form 123of an RE. 124.Pp 125The header 126.In regex.h 127declares two structure types, 128.Ft regex_t 129and 130.Ft regmatch_t , 131the former for compiled internal forms and the latter for match reporting. 132It also declares the four functions, a type 133.Ft regoff_t , 134and a number of constants with names starting with 135.Qq Dv REG_ . 
136.Ss Fn regcomp 137The 138.Fn regcomp 139function compiles the regular expression contained in the 140.Fa pattern 141string, subject to the flags in 142.Fa cflags , 143and places the results in the 144.Ft regex_t 145structure pointed to by 146.Fa preg . 147The 148.Fa cflags 149argument is the bitwise OR of zero or more of the following flags: 150.Bl -tag -width REG_EXTENDED 151.It Dv REG_EXTENDED 152Compile extended regular expressions 153.Pq EREs , 154rather than the basic regular expressions 155.Pq BREs 156that are the default. 157.It Dv REG_BASIC 158This is a synonym for 0, provided as a counterpart to 159.Dv REG_EXTENDED 160to improve readability. 161.It Dv REG_NOSPEC 162Compile with recognition of all special characters turned off. 163All characters are thus considered ordinary, so the RE is a literal string. 164This is an extension, compatible with but not specified by 165.St -p1003.2 , 166and should be used with caution in software intended to be portable to other 167systems. 168.Dv REG_EXTENDED 169and 170.Dv REG_NOSPEC 171may not be used in the same call to 172.Fn regcomp . 173.It Dv REG_ICASE 174Compile for matching that ignores upper/lower case distinctions. 175See 176.Xr regex 5 . 177.It Dv REG_NOSUB 178Compile for matching that need only report success or failure, 179not what was matched. 180.It Dv REG_NEWLINE 181Compile for newline-sensitive matching. 182By default, newline is a completely ordinary character with no special 183meaning in either REs or strings. 184With this flag, 185.Qq [^ 186bracket expressions and 187.Qq \&. 188never match newline, 189a 190.Qq \&^ 191anchor matches the null string after any newline in the string in addition to 192its normal function, and the 193.Qq \&$ 194anchor matches the null string before any newline in the string in addition to 195its normal function. 
196.It Dv REG_PEND 197The regular expression ends, not at the first NUL, but just before the character 198pointed to by the 199.Va re_endp 200member of the structure pointed to by 201.Fa preg . 202The 203.Va re_endp 204member is of type 205.Vt "const char *" . 206This flag permits inclusion of NULs in the RE; they are considered ordinary 207characters. 208This is an extension, compatible with but not specified by 209.St -p1003.2 , 210and should be used with caution in software intended to be portable to other 211systems. 212.El 213.Pp 214When successful, 215.Fn regcomp 216returns 0 and fills in the structure pointed to by 217.Fa preg . 218One member of that structure 219.Po other than 220.Va re_endp 221.Pc 222is publicized: 223.Va re_nsub , 224of type 225.Ft size_t , 226contains the number of parenthesized subexpressions within the RE 227.Po except that the value of this member is undefined if the 228.Dv REG_NOSUB 229flag was used 230.Pc . 231.Ss Fn regexec 232The 233.Fn regexec 234function matches the compiled RE pointed to by 235.Fa preg 236against the 237.Fa string , 238subject to the flags in 239.Fa eflags , 240and reports results using 241.Fa nmatch , 242.Fa pmatch , 243and the returned value. 244The RE must have been compiled by a previous invocation of 245.Fn regcomp . 246The compiled form is not altered during execution of 247.Fn regexec , 248so a single compiled RE can be used simultaneously by multiple threads. 249.Pp 250By default, the NUL-terminated string pointed to by 251.Fa string 252is considered to be the text of an entire line, minus any terminating 253newline. 254The 255.Fa eflags 256argument is the bitwise OR of zero or more of the following flags: 257.Bl -tag -width REG_STARTEND 258.It Dv REG_NOTBOL 259The first character of the string is treated as the continuation 260of a line. 261This means that the anchors 262.Qq \&^ , 263.Qq [[:<:]] , 264and 265.Qq \e< 266do not match before it; but see 267.Dv REG_STARTEND 268below. 
269This does not affect the behavior of newlines under 270.Dv REG_NEWLINE . 271.It Dv REG_NOTEOL 272The NUL terminating the string does not end a line, so the 273.Qq \&$ 274anchor does not match before it. 275This does not affect the behavior of newlines under 276.Dv REG_NEWLINE . 277.It Dv REG_STARTEND 278The string is considered to start at 279.Fa string No + 280.Fa pmatch Ns [0]. Ns Fa rm_so 281and to end before the byte located at 282.Fa string No + 283.Fa pmatch Ns [0]. Ns Fa rm_eo , 284regardless of the value of 285.Fa nmatch . 286See below for the definition of 287.Fa pmatch 288and 289.Fa nmatch . 290This is an extension, compatible with but not specified by 291.St -p1003.2 , 292and should be used with caution in software intended to be portable to other 293systems. 294.Pp 295Without 296.Dv REG_NOTBOL , 297the position 298.Fa rm_so 299is considered the beginning of a line, such that 300.Qq \&^ 301matches before it, and the beginning of a word if there is a word character at 302this position, such that 303.Qq [[:<:]] 304and 305.Qq \e< 306match before it. 307.Pp 308With 309.Dv REG_NOTBOL , 310the character at position 311.Fa rm_so 312is treated as the continuation of a line, and if 313.Fa rm_so 314is greater than 0, the preceding character is taken into consideration. 315If the preceding character is a newline and the regular expression was compiled 316with 317.Dv REG_NEWLINE , 318.Qq ^ 319matches before the string; if the preceding character is not a word character 320but the string starts with a word character, 321.Qq [[:<:]] 322and 323.Qq \e< 324match before the string. 325.El 326.Pp 327See 328.Xr regex 5 329for a discussion of what is matched in situations where an RE or a portion 330thereof could match any of several substrings of 331.Fa string . 
332.Pp 333If 334.Dv REG_NOSUB 335was specified in the compilation of the RE, or if 336.Fa nmatch 337is 0, 338.Fn regexec 339ignores the 340.Fa pmatch 341argument 342.Po but see below for the case where 343.Dv REG_STARTEND 344is specified 345.Pc . 346Otherwise, 347.Fa pmatch 348points to an array of 349.Fa nmatch 350structures of type 351.Ft regmatch_t . 352Such a structure has at least the members 353.Va rm_so 354and 355.Va rm_eo , 356both of type 357.Ft regoff_t 358.Po a signed arithmetic type at least as large as an 359.Ft off_t 360and a 361.Ft ssize_t 362.Pc , 363containing respectively the offset of the first character of a substring 364and the offset of the first character after the end of the substring. 365Offsets are measured from the beginning of the 366.Fa string 367argument given to 368.Fn regexec . 369An empty substring is denoted by equal offsets, both indicating the character 370following the empty substring. 371.Pp 372The 0th member of the 373.Fa pmatch 374array is filled in to indicate what substring of 375.Fa string 376was matched by the entire RE. 377Remaining members report what substring was matched by parenthesized 378subexpressions within the RE; member 379.Va i 380reports subexpression 381.Va i , 382with subexpressions counted 383.Pq starting at 1 384by the order of their opening parentheses in the RE, left to right. 385Unused entries in the array 386.Po corresponding either to subexpressions that did not participate in the match 387at all, or to subexpressions that do not exist in the RE 388.Po that is, 389.Va i 390> 391.Fa preg Ns -> Ns Va re_nsub 392.Pc 393.Pc 394have both 395.Va rm_so 396and 397.Va rm_eo 398set to -1. 399If a subexpression participated in the match several times, 400the reported substring is the last one it matched. 
401.Po Note, as an example in particular, that when the RE 402.Qq (b*)+ 403matches 404.Qq bbb , 405the parenthesized subexpression matches each of the three 406.So Li b Sc Ns s 407and then an infinite number of empty strings following the last 408.Qq b , 409so the reported substring is one of the empties. 410.Pc 411.Pp 412If 413.Dv REG_STARTEND 414is specified, 415.Fa pmatch 416must point to at least one 417.Ft regmatch_t 418.Po even if 419.Fa nmatch 420is 0 or 421.Dv REG_NOSUB 422was specified 423.Pc , 424to hold the input offsets for 425.Dv REG_STARTEND . 426Use for output is still entirely controlled by 427.Fa nmatch ; 428if 429.Fa nmatch 430is 0 or 431.Dv REG_NOSUB 432was specified, 433the value of 434.Fa pmatch Ns [0] 435will not be changed by a successful 436.Fn regexec . 437.Ss Fn regerror 438The 439.Fn regerror 440function maps a non-zero 441.Fa errcode 442from either 443.Fn regcomp 444or 445.Fn regexec 446to a human-readable, printable message. 447If 448.Fa preg 449is non-NULL, the error code should have arisen from use of the 450.Ft regex_t 451pointed to by 452.Fa preg , 453and if the error code came from 454.Fn regcomp , 455it should have been the result from the most recent 456.Fn regcomp 457using that 458.Ft regex_t . 459.Po 460The 461.Fn regerror 462function may be able to supply a more detailed message using information 463from the 464.Ft regex_t . 465.Pc 466The 467.Fn regerror 468function places the NUL-terminated message into the buffer pointed to by 469.Fa errbuf , 470limiting the length 471.Pq including the NUL 472to at most 473.Fa errbuf_size 474bytes. 475If the whole message will not fit, as much of it as will fit before the 476terminating NUL is supplied. 477In any case, the returned value is the size of buffer needed to hold the whole 478message 479.Pq including terminating NUL . 480If 481.Fa errbuf_size 482is 0, 483.Fa errbuf 484is ignored but the return value is still correct. 
485.Pp 486If the 487.Fa errcode 488given to 489.Fn regerror 490is first ORed with 491.Dv REG_ITOA , 492the 493.Qq message 494that results is the printable name of the error code, e.g. 495.Qq Dv REG_NOMATCH , 496rather than an explanation thereof. 497If 498.Fa errcode 499is 500.Dv REG_ATOI , 501then 502.Fa preg 503shall be non-NULL and the 504.Va re_endp 505member of the structure it points to must point to the printable name of an 506error code; in this case, the result in 507.Fa errbuf 508is the decimal digits of the numeric value of the error code 509.Pq 0 if the name is not recognized . 510.Dv REG_ITOA 511and 512.Dv REG_ATOI 513are intended primarily as debugging facilities; they are extensions, 514compatible with but not specified by 515.St -p1003.2 , 516and should be used with caution in software intended to be portable to other 517systems. 518.Ss Fn regfree 519The 520.Fn regfree 521function frees any dynamically-allocated storage associated with the compiled RE 522pointed to by 523.Fa preg . 524The remaining 525.Ft regex_t 526is no longer a valid compiled RE and the effect of supplying it to 527.Fn regexec 528or 529.Fn regerror 530is undefined. 531.Sh IMPLEMENTATION NOTES 532There are a number of decisions that 533.St -p1003.2 534leaves up to the implementor, 535either by explicitly saying 536.Qq undefined 537or by virtue of them being forbidden by the RE grammar. 538This implementation treats them as follows. 539.Pp 540There is no particular limit on the length of REs, except insofar as memory is 541limited. 542Memory usage is approximately linear in RE size, and largely insensitive 543to RE complexity, except for bounded repetitions. 544.Pp 545A backslashed character other than one specifically given a magic meaning by 546.St -p1003.2 547.Pq such magic meanings occur only in BREs 548is taken as an ordinary character. 549.Pp 550Any unmatched 551.Qq \&[ 552is a 553.Dv REG_EBRACK 554error. 
555.Pp 556Equivalence classes cannot begin or end bracket-expression ranges. 557The endpoint of one range cannot begin another. 558.Pp 559.Dv RE_DUP_MAX , 560the limit on repetition counts in bounded repetitions, is 255. 561.Pp 562A repetition operator 563.Po 564.Qq \&? , 565.Qq \&* , 566.Qq \&+ , 567or bounds 568.Pc 569cannot follow another repetition operator. 570A repetition operator cannot begin an expression or subexpression 571or follow 572.Qq \&^ 573or 574.Qq \&| . 575.Pp 576.Qq \&| 577cannot appear first or last in a (sub)expression or after another 578.Qq \&| , 579i.e., an operand of 580.Qq \&| 581cannot be an empty subexpression. 582An empty parenthesized subexpression, 583.Qq () , 584is legal and matches an empty (sub)string. 585An empty string is not a legal RE. 586.Pp 587A 588.Qq \&{ 589followed by a digit is considered the beginning of bounds for a bounded 590repetition, which must then follow the syntax for bounds. 591A 592.Qq \&{ 593.Em not 594followed by a digit is considered an ordinary character. 595.Pp 596.Qq \&^ 597and 598.Qq \&$ 599beginning and ending subexpressions in BREs are anchors, not ordinary 600characters. 601.Sh RETURN VALUES 602On successful completion, the 603.Fn regcomp 604function returns 0. 605Otherwise, it returns an integer value indicating an error as described in 606.In regex.h , 607and the content of preg is undefined. 608.Pp 609On successful completion, the 610.Fn regexec 611function returns 0. 612Otherwise it returns 613.Dv REG_NOMATCH 614to indicate no match, or 615.Dv REG_ENOSYS 616to indicate that the function is not supported. 617.Pp 618Upon successful completion, the 619.Fn regerror 620function returns the number of bytes needed to hold the entire generated string. 621Otherwise, it returns 0 to indicate that the function is not implemented. 622.Pp 623The 624.Fn regfree 625function returns no value. 
626.Pp 627The following constants are defined as error return values: 628.Pp 629.Bl -tag -width "REG_ECOLLATE" -compact 630.It Dv REG_NOMATCH 631The 632.Fn regexec 633function failed to match. 634.It Dv REG_BADPAT 635Invalid regular expression. 636.It Dv REG_ECOLLATE 637Invalid collating element referenced. 638.It Dv REG_ECTYPE 639Invalid character class type referenced. 640.It Dv REG_EESCAPE 641Trailing 642.Qq \&\e 643in pattern. 644.It Dv REG_ESUBREG 645Number in 646.Qq \&\e Ns Em digit 647invalid or in error. 648.It Dv REG_EBRACK 649.Qq [] 650imbalance. 651.It Dv REG_ENOSYS 652The function is not supported. 653.It Dv REG_EPAREN 654.Qq \e(\e) 655or 656.Qq () 657imbalance. 658.It Dv REG_EBRACE 659.Qq \e{\e} 660imbalance. 661.It Dv REG_BADBR 662Content of 663.Qq \e{\e} 664invalid: not a number, number too large, more than two 665numbers, first larger than second. 666.It Dv REG_ERANGE 667Invalid endpoint in range expression. 668.It Dv REG_ESPACE 669Out of memory. 670.It Dv REG_BADRPT 671.Qq \&? , 672.Qq * 673or 674.Qq + 675not preceded by valid regular expression. 676.El 677.Sh USAGE 678An application could use: 679.Bd -literal -offset Ds 680regerror(code, preg, (char *)NULL, (size_t)0) 681.Ed 682.Pp 683to find out how big a buffer is needed for the generated string, 684.Fn malloc 685a buffer to hold the string, and then call 686.Fn regerror 687again to get the string 688.Po see 689.Xr malloc 3C 690.Pc . 691Alternately, it could allocate a fixed, static buffer that is big enough to hold 692most strings, and then use 693.Fn malloc 694to allocate a larger buffer if it finds that this is too small. 695.Sh EXAMPLES 696Matching string against the extended regular expression in pattern. 697.Bd -literal -offset Ds 698#include <regex.h> 699 700/* 701* Match string against the extended regular expression in 702* pattern, treating errors as no match. 
703* 704* return 1 for match, 0 for no match 705*/ 706int 707match(const char *string, char *pattern) 708{ 709 int status; 710 regex_t re; 711 712 if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) { 713 return(0); /* report error */ 714 } 715 status = regexec(&re, string, (size_t) 0, NULL, 0); 716 regfree(&re); 717 if (status != 0) { 718 return(0); /* report error */ 719 } 720 return(1); 721} 722.Ed 723.Pp 724The following demonstrates how the 725.Dv REG_NOTBOL 726flag could be used with 727.Fn regexec 728to find all substrings in a line that match a pattern supplied by a user. 729.Pq For simplicity of the example, very little error checking is done. 730.Bd -literal -offset Ds 731(void) regcomp(&re, pattern, 0); 732/* this call to regexec() finds the first match on the line */ 733error = regexec(&re, &buffer[0], 1, &pm, 0); 734while (error == 0) { /* while matches found */ 735 /* substring found between pm.rm_so and pm.rm_eo */ 736 /* This call to regexec() finds the next match */ 737 error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL); 738} 739.Ed 740.Sh ERRORS 741No errors are defined. 742.Sh CODE SET INDEPENDENCE 743.Sy Enabled 744.Sh INTERFACE STABILITY 745.Sy Standard 746.Sh MT-LEVEL 747.Sy MT-Safe with exceptions 748.Pp 749The 750.Fn regcomp 751function can be used safely in a multithreaded application as long as 752.Xr setlocale 3C 753is not being called to change the locale. 754.Sh SEE ALSO 755.Xr attributes 5 , 756.Xr regex 5 , 757.Xr standards 5 758.Pp 759.St -p1003.2 , 760sections 2.8 761.Pq Regular Expression Notation 762and 763B.5 764.Pq C Binding for Regular Expression Matching . 765