#! /bin/sh
# From arnold@f7.net  Sun Apr 22 20:15:25 2007
# Date: Thu, 19 Apr 2007 17:09:02 +0300
# From: Pekka Pessi <Pekka.Pessi@nokia.com>
# X-Face: #V(jdpv[lI!TNUU=2*oh:="#suS*ponXW"yr6G;~L}<xZn_2^0)V{jqdc4y}@2b]ffd}SY#
# :9||1pew85O,WjiYA"6C7bW^zt^+.{b#B{lEE+4$9lrXL(55g}dU>uZ\JfD\"IG#G{j`hZI;=DmT\H
# pfDMyJ`i=:M;BM3R.`[>P^ER8+]i
# Subject: UTF-8 locale and \n in regexps
# To: bug-gawk@gnu.org
# Cc: Pekka.Pessi@nokia.com
# Message-id: <pvlkgoh2wx.fsf@nokia.com>
# MIME-version: 1.0
# Content-type: multipart/mixed; boundary="=-=-="
#
# --=-=-=
#
# Hello,
#
# It looks like regexp with \n in [^] behaves badly if locale has
# an UTF-8 ctype.
#
# It looks like if there is \n and an range without \n, like /\n[^x\n]foo/,
# and first \n ends an even-numbered line within the string, regexp
# does not match.
#
# Please see the attached script for an demonstration.
#
# --Pekka Pessi
#
#
# --=-=-=
# Content-Disposition: inline; filename=gawk-test
#
#! /bin/sh

# Summary of what follows:
#   * Requires the environment variable AWK to name the awk under test;
#     otherwise a message goes to stderr and the script exits with status 1.
#   * For each locale in the list below, LC_ALL is exported and the awk
#     program is fed the nine lines "line1" .. "line9" on standard input.
#   * Prints "LC_ALL=<locale> passed" per locale; on the first failing
#     locale prints "LC_ALL=<locale> FAILED" and exits with status 1.

# Refuse to run without an awk to test.
[ -n "$AWK" ] || {
    printf '$AWK must be set\n' >&2
    exit 1
}

# The awk program, identical for every locale.  RS is changed away from
# newline so that the regexps below, which contain \n, can be matched
# against the multi-line input held in $0.  Each check that does not
# match prints "no match N"; the END rule exits with status 0 only when
# all eight checks matched.
regex_checks='
BEGIN { RS="\0"; }
{
  if (match($0, /\n[^2\n]*2/)) { got2=1; } else { print "no match 2"; }
  if (match($0, /\n[^3\n]*3/)) { got3=1; } else { print "no match 3"; }
  if (match($0, /\n[^4\n]*4/)) { got4=1; } else { print "no match 4"; }
  if (match($0, /\n[^5\t]*5/)) { got5=1; } else { print "no match 5"; }
  if (match($0, /\n[^6\n]*6/)) { got6=1; } else { print "no match 6"; }
  if (match($0, /\n[a-z]*7\n/)){ got7=1; } else { print "no match 7"; }
  if (match($0, /\n[^8\n]*8/)) { got8=1; } else { print "no match 8"; }
  if (match($0, /8.[^9\n]+9/)) { got9=1; } else { print "no match 9"; }
}

END { exit(!(got2 && got3 && got4 && got5 && got6 && got7 && got8 && got9)); }
'

# April 2010: Remove UNKNOWN, causes spurious failures on some systems
for locale_name in C POSIX en_US.ISO8859-1 en_US.UTF-8 #UNKNOWN
do
    LC_ALL=$locale_name
    export LC_ALL

    # $AWK is left unquoted, exactly as before, so its value still
    # undergoes word splitting (e.g. if it carries options).
    if printf '%s\n' line1 line2 line3 line4 line5 line6 line7 line8 line9 |
        $AWK "$regex_checks"
    then
        echo "LC_ALL=$LC_ALL passed"
    else
        echo "LC_ALL=$LC_ALL FAILED"
        exit 1
    fi
done
#
# --=-=-=--
#