1.\" Copyright (c) 1992, 1993, 1994 Henry Spencer. 2.\" Copyright (c) 1992, 1993, 1994 3.\" The Regents of the University of California. All rights reserved. 4.\" 5.\" This code is derived from software contributed to Berkeley by 6.\" Henry Spencer. 7.\" 8.\" Redistribution and use in source and binary forms, with or without 9.\" modification, are permitted provided that the following conditions 10.\" are met: 11.\" 1. Redistributions of source code must retain the above copyright 12.\" notice, this list of conditions and the following disclaimer. 13.\" 2. Redistributions in binary form must reproduce the above copyright 14.\" notice, this list of conditions and the following disclaimer in the 15.\" documentation and/or other materials provided with the distribution. 16.\" 3. Neither the name of the University nor the names of its contributors 17.\" may be used to endorse or promote products derived from this software 18.\" without specific prior written permission. 19.\" 20.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 21.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 23.\" ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 24.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 26.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 27.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 28.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 29.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 30.\" SUCH DAMAGE. 31.\" 32.\" Sun Microsystems, Inc. gratefully acknowledges The Open Group for 33.\" permission to reproduce portions of its copyrighted documentation. 
34.\" Original documentation from The Open Group can be obtained online at 35.\" http://www.opengroup.org/bookstore/. 36.\" 37.\" The Institute of Electrical and Electronics Engineers and The Open 38.\" Group, have given us permission to reprint portions of their 39.\" documentation. 40.\" 41.\" In the following statement, the phrase ``this text'' refers to portions 42.\" of the system documentation. 43.\" 44.\" Portions of this text are reprinted and reproduced in electronic form 45.\" in the SunOS Reference Manual, from IEEE Std 1003.1, 2004 Edition, 46.\" Standard for Information Technology -- Portable Operating System 47.\" Interface (POSIX), The Open Group Base Specifications Issue 6, 48.\" Copyright (C) 2001-2004 by the Institute of Electrical and Electronics 49.\" Engineers, Inc and The Open Group. In the event of any discrepancy 50.\" between these versions and the original IEEE and The Open Group 51.\" Standard, the original IEEE and The Open Group Standard is the referee 52.\" document. The original Standard can be obtained online at 53.\" http://www.opengroup.org/unix/online.html. 54.\" 55.\" This notice shall appear on any product containing this material. 56.\" 57.\" The contents of this file are subject to the terms of the 58.\" Common Development and Distribution License (the "License"). 59.\" You may not use this file except in compliance with the License. 60.\" 61.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 62.\" or http://www.opensolaris.org/os/licensing. 63.\" See the License for the specific language governing permissions 64.\" and limitations under the License. 65.\" 66.\" When distributing Covered Code, include this CDDL HEADER in each 67.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
68.\" If applicable, add the following below this CDDL HEADER, with the 69.\" fields enclosed by brackets "[]" replaced with your own identifying 70.\" information: Portions Copyright [yyyy] [name of copyright owner] 71.\" 72.\" 73.\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved. 74.\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved. 75.\" Copyright 2017 Nexenta Systems, Inc. 76.\" 77.Dd June 14, 2017 78.Dt REGCOMP 3C 79.Os 80.Sh NAME 81.Nm regcomp , 82.Nm regexec , 83.Nm regerror , 84.Nm regfree 85.Nd regular-expression library 86.Sh LIBRARY 87.Lb libc 88.Sh SYNOPSIS 89.In regex.h 90.Ft int 91.Fo regcomp 92.Fa "regex_t *restrict preg" "const char *restrict pattern" "int cflags" 93.Fc 94.Ft int 95.Fo regexec 96.Fa "const regex_t *restrict preg" "const char *restrict string" 97.Fa "size_t nmatch" "regmatch_t pmatch[restrict]" "int eflags" 98.Fc 99.Ft size_t 100.Fo regerror 101.Fa "int errcode" "const regex_t *restrict preg" 102.Fa "char *restrict errbuf" "size_t errbuf_size" 103.Fc 104.Ft void 105.Fn regfree "regex_t *preg" 106.Sh DESCRIPTION 107These routines implement 108.St -p1003.2 109regular expressions; see 110.Xr regex 5 . 111The 112.Fn regcomp 113function compiles an RE written as a string into an internal form, 114.Fn regexec 115matches that internal form against a string and reports results, 116.Fn regerror 117transforms error codes from either into human-readable messages, 118and 119.Fn regfree 120frees any dynamically-allocated storage used by the internal form 121of an RE. 122.Pp 123The header 124.In regex.h 125declares two structure types, 126.Ft regex_t 127and 128.Ft regmatch_t , 129the former for compiled internal forms and the latter for match reporting. 130It also declares the four functions, a type 131.Ft regoff_t , 132and a number of constants with names starting with 133.Qq Dv REG_ . 
134.Ss Fn regcomp 135The 136.Fn regcomp 137function compiles the regular expression contained in the 138.Fa pattern 139string, subject to the flags in 140.Fa cflags , 141and places the results in the 142.Ft regex_t 143structure pointed to by 144.Fa preg . 145The 146.Fa cflags 147argument is the bitwise OR of zero or more of the following flags: 148.Bl -tag -width REG_EXTENDED 149.It Dv REG_EXTENDED 150Compile extended regular expressions 151.Pq EREs , 152rather than the basic regular expressions 153.Pq BREs 154that are the default. 155.It Dv REG_BASIC 156This is a synonym for 0, provided as a counterpart to 157.Dv REG_EXTENDED 158to improve readability. 159.It Dv REG_NOSPEC 160Compile with recognition of all special characters turned off. 161All characters are thus considered ordinary, so the RE is a literal string. 162This is an extension, compatible with but not specified by 163.St -p1003.2 , 164and should be used with caution in software intended to be portable to other 165systems. 166.Dv REG_EXTENDED 167and 168.Dv REG_NOSPEC 169may not be used in the same call to 170.Fn regcomp . 171.It Dv REG_ICASE 172Compile for matching that ignores upper/lower case distinctions. 173See 174.Xr regex 5 . 175.It Dv REG_NOSUB 176Compile for matching that need only report success or failure, 177not what was matched. 178.It Dv REG_NEWLINE 179Compile for newline-sensitive matching. 180By default, newline is a completely ordinary character with no special 181meaning in either REs or strings. 182With this flag, 183.Qq [^ 184bracket expressions and 185.Qq \&. 186never match newline, 187a 188.Qq \&^ 189anchor matches the null string after any newline in the string in addition to 190its normal function, and the 191.Qq \&$ 192anchor matches the null string before any newline in the string in addition to 193its normal function. 
194.It Dv REG_PEND 195The regular expression ends, not at the first NUL, but just before the character 196pointed to by the 197.Va re_endp 198member of the structure pointed to by 199.Fa preg . 200The 201.Va re_endp 202member is of type 203.Vt "const char *" . 204This flag permits inclusion of NULs in the RE; they are considered ordinary 205characters. 206This is an extension, compatible with but not specified by 207.St -p1003.2 , 208and should be used with caution in software intended to be portable to other 209systems. 210.El 211.Pp 212When successful, 213.Fn regcomp 214returns 0 and fills in the structure pointed to by 215.Fa preg . 216One member of that structure 217.Po other than 218.Va re_endp 219.Pc 220is publicized: 221.Va re_nsub , 222of type 223.Ft size_t , 224contains the number of parenthesized subexpressions within the RE 225.Po except that the value of this member is undefined if the 226.Dv REG_NOSUB 227flag was used 228.Pc . 229.Ss Fn regexec 230The 231.Fn regexec 232function matches the compiled RE pointed to by 233.Fa preg 234against the 235.Fa string , 236subject to the flags in 237.Fa eflags , 238and reports results using 239.Fa nmatch , 240.Fa pmatch , 241and the returned value. 242The RE must have been compiled by a previous invocation of 243.Fn regcomp . 244The compiled form is not altered during execution of 245.Fn regexec , 246so a single compiled RE can be used simultaneously by multiple threads. 247.Pp 248By default, the NUL-terminated string pointed to by 249.Fa string 250is considered to be the text of an entire line, minus any terminating 251newline. 252The 253.Fa eflags 254argument is the bitwise OR of zero or more of the following flags: 255.Bl -tag -width REG_STARTEND 256.It Dv REG_NOTBOL 257The first character of the string is treated as the continuation 258of a line. 259This means that the anchors 260.Qq \&^ , 261.Qq [[:<:]] , 262and 263.Qq \e< 264do not match before it; but see 265.Dv REG_STARTEND 266below. 
267This does not affect the behavior of newlines under 268.Dv REG_NEWLINE . 269.It Dv REG_NOTEOL 270The NUL terminating the string does not end a line, so the 271.Qq \&$ 272anchor does not match before it. 273This does not affect the behavior of newlines under 274.Dv REG_NEWLINE . 275.It Dv REG_STARTEND 276The string is considered to start at 277.Fa string No + 278.Fa pmatch Ns [0]. Ns Fa rm_so 279and to end before the byte located at 280.Fa string No + 281.Fa pmatch Ns [0]. Ns Fa rm_eo , 282regardless of the value of 283.Fa nmatch . 284See below for the definition of 285.Fa pmatch 286and 287.Fa nmatch . 288This is an extension, compatible with but not specified by 289.St -p1003.2 , 290and should be used with caution in software intended to be portable to other 291systems. 292.Pp 293Without 294.Dv REG_NOTBOL , 295the position 296.Fa rm_so 297is considered the beginning of a line, such that 298.Qq \&^ 299matches before it, and the beginning of a word if there is a word character at 300this position, such that 301.Qq [[:<:]] 302and 303.Qq \e< 304match before it. 305.Pp 306With 307.Dv REG_NOTBOL , 308the character at position 309.Fa rm_so 310is treated as the continuation of a line, and if 311.Fa rm_so 312is greater than 0, the preceding character is taken into consideration. 313If the preceding character is a newline and the regular expression was compiled 314with 315.Dv REG_NEWLINE , 316.Qq ^ 317matches before the string; if the preceding character is not a word character 318but the string starts with a word character, 319.Qq [[:<:]] 320and 321.Qq \e< 322match before the string. 323.El 324.Pp 325See 326.Xr regex 5 327for a discussion of what is matched in situations where an RE or a portion 328thereof could match any of several substrings of 329.Fa string . 
330.Pp 331If 332.Dv REG_NOSUB 333was specified in the compilation of the RE, or if 334.Fa nmatch 335is 0, 336.Fn regexec 337ignores the 338.Fa pmatch 339argument 340.Po but see below for the case where 341.Dv REG_STARTEND 342is specified 343.Pc . 344Otherwise, 345.Fa pmatch 346points to an array of 347.Fa nmatch 348structures of type 349.Ft regmatch_t . 350Such a structure has at least the members 351.Va rm_so 352and 353.Va rm_eo , 354both of type 355.Ft regoff_t 356.Po a signed arithmetic type at least as large as an 357.Ft off_t 358and a 359.Ft ssize_t 360.Pc , 361containing respectively the offset of the first character of a substring 362and the offset of the first character after the end of the substring. 363Offsets are measured from the beginning of the 364.Fa string 365argument given to 366.Fn regexec . 367An empty substring is denoted by equal offsets, both indicating the character 368following the empty substring. 369.Pp 370The 0th member of the 371.Fa pmatch 372array is filled in to indicate what substring of 373.Fa string 374was matched by the entire RE. 375Remaining members report what substring was matched by parenthesized 376subexpressions within the RE; member 377.Va i 378reports subexpression 379.Va i , 380with subexpressions counted 381.Pq starting at 1 382by the order of their opening parentheses in the RE, left to right. 383Unused entries in the array 384.Po corresponding either to subexpressions that did not participate in the match 385at all, or to subexpressions that do not exist in the RE 386.Po that is, 387.Va i 388> 389.Fa preg Ns -> Ns Va re_nsub 390.Pc 391.Pc 392have both 393.Va rm_so 394and 395.Va rm_eo 396set to -1. 397If a subexpression participated in the match several times, 398the reported substring is the last one it matched. 
399.Po Note, as an example in particular, that when the RE 400.Qq (b*)+ 401matches 402.Qq bbb , 403the parenthesized subexpression matches each of the three 404.So Li b Sc Ns s 405and then an infinite number of empty strings following the last 406.Qq b , 407so the reported substring is one of the empties. 408.Pc 409.Pp 410If 411.Dv REG_STARTEND 412is specified, 413.Fa pmatch 414must point to at least one 415.Ft regmatch_t 416.Po even if 417.Fa nmatch 418is 0 or 419.Dv REG_NOSUB 420was specified 421.Pc , 422to hold the input offsets for 423.Dv REG_STARTEND . 424Use for output is still entirely controlled by 425.Fa nmatch ; 426if 427.Fa nmatch 428is 0 or 429.Dv REG_NOSUB 430was specified, 431the value of 432.Fa pmatch Ns [0] 433will not be changed by a successful 434.Fn regexec . 435.Ss Fn regerror 436The 437.Fn regerror 438function maps a non-zero 439.Fa errcode 440from either 441.Fn regcomp 442or 443.Fn regexec 444to a human-readable, printable message. 445If 446.Fa preg 447is non-NULL, the error code should have arisen from use of the 448.Ft regex_t 449pointed to by 450.Fa preg , 451and if the error code came from 452.Fn regcomp , 453it should have been the result from the most recent 454.Fn regcomp 455using that 456.Ft regex_t . 457.Po 458The 459.Fn regerror 460function may be able to supply a more detailed message using information 461from the 462.Ft regex_t . 463.Pc 464The 465.Fn regerror 466function places the NUL-terminated message into the buffer pointed to by 467.Fa errbuf , 468limiting the length 469.Pq including the NUL 470to at most 471.Fa errbuf_size 472bytes. 473If the whole message will not fit, as much of it as will fit before the 474terminating NUL is supplied. 475In any case, the returned value is the size of buffer needed to hold the whole 476message 477.Pq including terminating NUL . 478If 479.Fa errbuf_size 480is 0, 481.Fa errbuf 482is ignored but the return value is still correct. 
483.Pp 484If the 485.Fa errcode 486given to 487.Fn regerror 488is first ORed with 489.Dv REG_ITOA , 490the 491.Qq message 492that results is the printable name of the error code, e.g. 493.Qq Dv REG_NOMATCH , 494rather than an explanation thereof. 495If 496.Fa errcode 497is 498.Dv REG_ATOI , 499then 500.Fa preg 501shall be non-NULL and the 502.Va re_endp 503member of the structure it points to must point to the printable name of an 504error code; in this case, the result in 505.Fa errbuf 506is the decimal digits of the numeric value of the error code 507.Pq 0 if the name is not recognized . 508.Dv REG_ITOA 509and 510.Dv REG_ATOI 511are intended primarily as debugging facilities; they are extensions, 512compatible with but not specified by 513.St -p1003.2 , 514and should be used with caution in software intended to be portable to other 515systems. 516.Ss Fn regfree 517The 518.Fn regfree 519function frees any dynamically-allocated storage associated with the compiled RE 520pointed to by 521.Fa preg . 522The remaining 523.Ft regex_t 524is no longer a valid compiled RE and the effect of supplying it to 525.Fn regexec 526or 527.Fn regerror 528is undefined. 529.Sh IMPLEMENTATION NOTES 530There are a number of decisions that 531.St -p1003.2 532leaves up to the implementor, 533either by explicitly saying 534.Qq undefined 535or by virtue of them being forbidden by the RE grammar. 536This implementation treats them as follows. 537.Pp 538There is no particular limit on the length of REs, except insofar as memory is 539limited. 540Memory usage is approximately linear in RE size, and largely insensitive 541to RE complexity, except for bounded repetitions. 542.Pp 543A backslashed character other than one specifically given a magic meaning by 544.St -p1003.2 545.Pq such magic meanings occur only in BREs 546is taken as an ordinary character. 547.Pp 548Any unmatched 549.Qq \&[ 550is a 551.Dv REG_EBRACK 552error. 
553.Pp 554Equivalence classes cannot begin or end bracket-expression ranges. 555The endpoint of one range cannot begin another. 556.Pp 557.Dv RE_DUP_MAX , 558the limit on repetition counts in bounded repetitions, is 255. 559.Pp 560A repetition operator 561.Po 562.Qq \&? , 563.Qq \&* , 564.Qq \&+ , 565or bounds 566.Pc 567cannot follow another repetition operator. 568A repetition operator cannot begin an expression or subexpression 569or follow 570.Qq \&^ 571or 572.Qq \&| . 573.Pp 574.Qq \&| 575cannot appear first or last in a (sub)expression or after another 576.Qq \&| , 577i.e., an operand of 578.Qq \&| 579cannot be an empty subexpression. 580An empty parenthesized subexpression, 581.Qq () , 582is legal and matches an empty (sub)string. 583An empty string is not a legal RE. 584.Pp 585A 586.Qq \&{ 587followed by a digit is considered the beginning of bounds for a bounded 588repetition, which must then follow the syntax for bounds. 589A 590.Qq \&{ 591.Em not 592followed by a digit is considered an ordinary character. 593.Pp 594.Qq \&^ 595and 596.Qq \&$ 597beginning and ending subexpressions in BREs are anchors, not ordinary 598characters. 599.Sh RETURN VALUES 600On successful completion, the 601.Fn regcomp 602function returns 0. 603Otherwise, it returns an integer value indicating an error as described in 604.In regex.h , 605and the content of preg is undefined. 606.Pp 607On successful completion, the 608.Fn regexec 609function returns 0. 610Otherwise it returns 611.Dv REG_NOMATCH 612to indicate no match, or 613.Dv REG_ENOSYS 614to indicate that the function is not supported. 615.Pp 616Upon successful completion, the 617.Fn regerror 618function returns the number of bytes needed to hold the entire generated string. 619Otherwise, it returns 0 to indicate that the function is not implemented. 620.Pp 621The 622.Fn regfree 623function returns no value. 
624.Pp 625The following constants are defined as error return values: 626.Pp 627.Bl -tag -width "REG_ECOLLATE" -compact 628.It Dv REG_NOMATCH 629The 630.Fn regexec 631function failed to match. 632.It Dv REG_BADPAT 633Invalid regular expression. 634.It Dv REG_ECOLLATE 635Invalid collating element referenced. 636.It Dv REG_ECTYPE 637Invalid character class type referenced. 638.It Dv REG_EESCAPE 639Trailing 640.Qq \&\e 641in pattern. 642.It Dv REG_ESUBREG 643Number in 644.Qq \&\e Ns Em digit 645invalid or in error. 646.It Dv REG_EBRACK 647.Qq [] 648imbalance. 649.It Dv REG_ENOSYS 650The function is not supported. 651.It Dv REG_EPAREN 652.Qq \e(\e) 653or 654.Qq () 655imbalance. 656.It Dv REG_EBRACE 657.Qq \e{\e} 658imbalance. 659.It Dv REG_BADBR 660Content of 661.Qq \e{\e} 662invalid: not a number, number too large, more than two 663numbers, first larger than second. 664.It Dv REG_ERANGE 665Invalid endpoint in range expression. 666.It Dv REG_ESPACE 667Out of memory. 668.It Dv REG_BADRPT 669.Qq \&? , 670.Qq * 671or 672.Qq + 673not preceded by valid regular expression. 674.El 675.Sh USAGE 676An application could use: 677.Bd -literal -offset Ds 678regerror(code, preg, (char *)NULL, (size_t)0) 679.Ed 680.Pp 681to find out how big a buffer is needed for the generated string, 682.Fn malloc 683a buffer to hold the string, and then call 684.Fn regerror 685again to get the string 686.Po see 687.Xr malloc 3C 688.Pc . 689Alternatively, it could allocate a fixed, static buffer that is big enough to hold 690most strings, and then use 691.Fn malloc 692to allocate a larger buffer if it finds that this is too small. 693.Sh EXAMPLES 694Matching string against the extended regular expression in pattern. 695.Bd -literal -offset Ds 696#include <regex.h> 697 698/* 699* Match string against the extended regular expression in 700* pattern, treating errors as no match. 
701* 702* return 1 for match, 0 for no match 703*/ 704int 705match(const char *string, char *pattern) 706{ 707 int status; 708 regex_t re; 709 710 if (regcomp(&re, pattern, REG_EXTENDED\||\|REG_NOSUB) != 0) { 711 return(0); /* report error */ 712 } 713 status = regexec(&re, string, (size_t) 0, NULL, 0); 714 regfree(&re); 715 if (status != 0) { 716 return(0); /* report error */ 717 } 718 return(1); 719} 720.Ed 721.Pp 722The following demonstrates how the 723.Dv REG_NOTBOL 724flag could be used with 725.Fn regexec 726to find all substrings in a line that match a pattern supplied by a user. 727.Pq For simplicity of the example, very little error checking is done. 728.Bd -literal -offset Ds 729(void) regcomp(&re, pattern, 0); 730/* this call to regexec() finds the first match on the line */ 731error = regexec(&re, &buffer[0], 1, &pm, 0); 732while (error == 0) { /* while matches found */ 733 /* substring found between pm.rm_so and pm.rm_eo */ 734 /* This call to regexec() finds the next match */ 735 error = regexec(&re, buffer + pm.rm_eo, 1, &pm, REG_NOTBOL); 736} 737.Ed 738.Sh ERRORS 739No errors are defined. 740.Sh CODE SET INDEPENDENCE 741.Sy Enabled 742.Sh INTERFACE STABILITY 743.Sy Standard 744.Sh MT-LEVEL 745.Sy MT-Safe with exceptions 746.Pp 747The 748.Fn regcomp 749function can be used safely in a multithreaded application as long as 750.Xr setlocale 3C 751is not being called to change the locale. 752.Sh SEE ALSO 753.Xr attributes 5 , 754.Xr regex 5 , 755.Xr standards 5 756.Pp 757.St -p1003.2 , 758sections 2.8 759.Pq Regular Expression Notation 760and 761B.5 762.Pq C Binding for Regular Expression Matching . 763