xref: /freebsd/lib/libc/tests/regex/multibyte.sh (revision 53120fbb68952b7d620c2c0e1cf05c5017fc1b27)
atf_test_case bmpat
bmpat_head()
{
	atf_set "descr" "Check matching multibyte characters (PR153502)"
}
bmpat_body()
{
	export LC_CTYPE="C.UTF-8"

	# Table of (input, sed script) pairs.  Every input is fed to sed
	# without a trailing newline and must come back unchanged, i.e. each
	# "." in the script has to consume one whole multibyte character.
	set -- \
	    'é'     '/^.$/p' \
	    'éé'    '/^..$/p' \
	    'aéa'   '/a.a/p' \
	    'aéa'   '/a.*a/p' \
	    'aaéaa' '/aa.aa/p' \
	    'aéaéa' '/a.a.a/p' \
	    'éa'    '/.a/p' \
	    'aéaa'  '/a.aa/p' \
	    'éaé'   '/.a./p'

	# Consume the table two words at a time: $1 is the input text and
	# also the expected output, $2 is the sed script under test.
	while [ $# -gt 0 ]; do
		printf '%s' "$1" | atf_check -o "inline:$1" \
		    sed -ne "$2"
		shift 2
	done
}
29
atf_test_case icase
icase_head()
{
	atf_set "descr" "Check case-insensitive matching for characters 128-255"
}
icase_body()
{
	local micro cap_mu small_mu

	export LC_CTYPE="C.UTF-8"

	# UTF-8 encodings built from octal escapes so the test does not depend
	# on how this file itself is encoded.
	micro=$(printf '\302\265')	# U+00B5
	cap_mu=$(printf '\316\234')	# U+039C
	small_mu=$(printf '\316\274')	# U+03BC

	# With sed's I (ignore case) flag, a pattern consisting of U+00B5 is
	# expected to match a line holding U+039C as well as one holding
	# U+03BC.  The input is emitted with a quoted printf '%s\n' so the raw
	# bytes are never subject to word splitting, globbing or echo's
	# implementation-defined option/escape handling.
	printf '%s\n' "$cap_mu" |
	    atf_check -o "inline:$cap_mu\n" sed -ne "/$micro/Ip"
	printf '%s\n' "$small_mu" |
	    atf_check -o "inline:$small_mu\n" sed -ne "/$micro/Ip"
}
46
atf_test_case mbset cleanup
mbset_head()
{
	atf_set "descr" "Check multibyte sets matching"
}
mbset_body()
{
	export LC_CTYPE="C.UTF-8"

	# Regression check for a faulty optimization that collapsed a
	# single-element bracket set into an exact match on one codepoint.
	# A set keeps small codepoints in a bitmap and large codepoints in an
	# array; the optimization wrongly fired when either of those two
	# containers held exactly one entry, and so dropped whatever the other
	# container held.
	#
	# Each set below therefore has one member on one side and one or more
	# on the other, and every member is probed in turn.  The whole list is
	# run once with basic regular expressions (-ne) and once with extended
	# ones (-Ene).
	for sed_mode in -ne -Ene; do
		# (input character, bracket set contents) pairs.
		set -- \
		    'a' 'aà' \
		    'à' 'aà' \
		    'a' 'aàá' \
		    'à' 'aàá' \
		    'á' 'aàá' \
		    'à' 'abà' \
		    'a' 'abà' \
		    'b' 'abà'
		while [ $# -gt 0 ]; do
			# The file holds the single character, no newline.
			printf '%s' "$1" > mbset
			atf_check -o not-empty sed "$sed_mode" "/[$2]/p" mbset
			shift 2
		done
	done
}
mbset_cleanup()
{
	# Remove the scratch file written by mbset_body.
	rm -f mbset
}
87
# Register every test case defined in this file, in declaration order.
atf_init_test_cases()
{
	for tc_name in bmpat icase mbset; do
		atf_add_test_case "$tc_name"
	done
}
94