xref: /freebsd/usr.bin/sort/vsort.c (revision 02e9120893770924227138ba49df1edb3896112a)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/types.h>
31 
32 #include <ctype.h>
33 #include <stdlib.h>
34 #include <string.h>
35 
36 #include "sort.h"
37 #include "vsort.h"
38 
39 static inline bool
40 isdigit_clocale(wchar_t c)
41 {
42 
43 	return (c >= L'0' && c <= L'9');
44 }
45 
46 static inline bool
47 isalpha_clocale(wchar_t c)
48 {
49 
50 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
51 }
52 
53 static inline bool
54 isalnum_clocale(wchar_t c)
55 {
56 
57 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
58 	    (c >= L'0' && c <= L'9'));
59 }
60 
61 /*
62  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
63  * Set length of string before suffix.
64  */
65 static void
66 find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
67 {
68 	wchar_t c;
69 	size_t clen;
70 	bool expect_alpha, sfx;
71 
72 	sfx = false;
73 	expect_alpha = false;
74 	*len = 0;
75 	clen = 0;
76 
77 	while ((si < se) && (c = bws_get_iter_value(si))) {
78 		if (expect_alpha) {
79 			expect_alpha = false;
80 			if (!isalpha_clocale(c) && (c != L'~'))
81 				sfx = false;
82 		} else if (c == L'.') {
83 			expect_alpha = true;
84 			if (!sfx) {
85 				sfx = true;
86 				*len = clen;
87 			}
88 		} else if (!isalnum_clocale(c) && (c != L'~'))
89 			sfx = false;
90 
91 		si = bws_iterator_inc(si, 1);
92 		++clen;
93 	}
94 
95 	/* This code must be here to make the implementation compatible
96 	 * with WORDING of GNU sort documentation.
97 	 * But the GNU sort implementation is not following its own
98 	 * documentation.  GNU sort allows empty file extensions
99 	 * (just dot with nothing after); but the regular expression in
100 	 * their documentation does not allow empty file extensions.
101 	 * We chose to make our implementation compatible with GNU sort
102 	 * implementation.  If they will ever fix their bug, this code
103 	 * must be uncommented. Or they may choose to fix the info page,
104 	 * then the code stays commented.
105 	 *
106 	 if (expect_alpha)
107 	 	sfx = false;
108 	 */
109 
110 	if (!sfx)
111 		*len = clen;
112 }
113 
114 static inline int
115 cmp_chars(wchar_t c1, wchar_t c2)
116 {
117 
118 	if (c1 == c2)
119 		return (0);
120 
121 	if (c1 == L'~')
122 		return (-1);
123 	if (c2 == L'~')
124 		return (+1);
125 
126 	if (isdigit_clocale(c1) || !c1)
127 		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
128 
129 	if (isdigit_clocale(c2) || !c2)
130 		return (+1);
131 
132 	if (isalpha_clocale(c1))
133 		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
134 
135 	if (isalpha_clocale(c2))
136 		return (+1);
137 
138 	return ((int) c1 - (int) c2);
139 }
140 
141 static int
142 cmpversions(bwstring_iterator si1, bwstring_iterator se1,
143     bwstring_iterator si2, bwstring_iterator se2)
144 {
145 	int cmp, diff;
146 
147 	while ((si1 < se1) || (si2 < se2)) {
148 		diff = 0;
149 
150 		while (((si1 < se1) &&
151 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
152 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
153 			wchar_t c1, c2;
154 
155 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
156 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
157 
158 			cmp = cmp_chars(c1, c2);
159 			if (cmp)
160 				return (cmp);
161 
162 			if (si1 < se1)
163 				si1 = bws_iterator_inc(si1, 1);
164 			if (si2 < se2)
165 				si2 = bws_iterator_inc(si2, 1);
166 		}
167 
168 		while (bws_get_iter_value(si1) == L'0')
169 			si1 = bws_iterator_inc(si1, 1);
170 
171 		while (bws_get_iter_value(si2) == L'0')
172 			si2 = bws_iterator_inc(si2, 1);
173 
174 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
175 		    isdigit_clocale(bws_get_iter_value(si2))) {
176 			if (!diff)
177 				diff = ((int)bws_get_iter_value(si1) -
178 				    (int)bws_get_iter_value(si2));
179 			si1 = bws_iterator_inc(si1, 1);
180 			si2 = bws_iterator_inc(si2, 1);
181 		}
182 
183 		if (isdigit_clocale(bws_get_iter_value(si1)))
184 			return (1);
185 
186 		if (isdigit_clocale(bws_get_iter_value(si2)))
187 			return (-1);
188 
189 		if (diff)
190 			return (diff);
191 	}
192 
193 	return (0);
194 }
195 
196 /*
197  * Compare two version strings
198  */
199 int
200 vcmp(struct bwstring *s1, struct bwstring *s2)
201 {
202 	bwstring_iterator si1, si2;
203 	wchar_t c1, c2;
204 	size_t len1, len2, slen1, slen2;
205 	int cmp_bytes, cmp_res;
206 
207 	if (s1 == s2)
208 		return (0);
209 
210 	cmp_bytes = bwscmp(s1, s2, 0);
211 	if (cmp_bytes == 0)
212 		return (0);
213 
214 	len1 = slen1 = BWSLEN(s1);
215 	len2 = slen2 = BWSLEN(s2);
216 
217 	if (slen1 < 1)
218 		return (-1);
219 	if (slen2 < 1)
220 		return (+1);
221 
222 	si1 = bws_begin(s1);
223 	si2 = bws_begin(s2);
224 
225 	c1 = bws_get_iter_value(si1);
226 	c2 = bws_get_iter_value(si2);
227 
228 	if (c1 == L'.' && (slen1 == 1))
229 		return (-1);
230 
231 	if (c2 == L'.' && (slen2 == 1))
232 		return (+1);
233 
234 	if (slen1 == 2 && c1 == L'.' &&
235 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
236 		return (-1);
237 	if (slen2 == 2 && c2 == L'.' &&
238 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
239 		return (+1);
240 
241 	if (c1 == L'.' && c2 != L'.')
242 		return (-1);
243 	if (c1 != L'.' && c2 == L'.')
244 		return (+1);
245 
246 	if (c1 == L'.' && c2 == L'.') {
247 		si1 = bws_iterator_inc(si1, 1);
248 		si2 = bws_iterator_inc(si2, 1);
249 	}
250 
251 	find_suffix(si1, bws_end(s1), &len1);
252 	find_suffix(si2, bws_end(s2), &len2);
253 
254 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
255 		return (cmp_bytes);
256 
257 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
258 	    bws_iterator_inc(si2, len2));
259 
260 	if (cmp_res == 0)
261 	  cmp_res = cmp_bytes;
262 
263 	return (cmp_res);
264 }
265