xref: /freebsd/usr.bin/sort/vsort.c (revision cfd6422a5217410fbd66f7a7a8a64d9d85e61229)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include <sys/types.h>
34 
35 #include <ctype.h>
36 #include <stdlib.h>
37 #include <string.h>
38 
39 #include "sort.h"
40 #include "vsort.h"
41 
42 static inline bool
43 isdigit_clocale(wchar_t c)
44 {
45 
46 	return (c >= L'0' && c <= L'9');
47 }
48 
49 static inline bool
50 isalpha_clocale(wchar_t c)
51 {
52 
53 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
54 }
55 
56 static inline bool
57 isalnum_clocale(wchar_t c)
58 {
59 
60 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
61 	    (c >= L'0' && c <= L'9'));
62 }
63 
64 /*
65  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
66  * Set length of string before suffix.
67  */
68 static void
69 find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
70 {
71 	wchar_t c;
72 	size_t clen;
73 	bool expect_alpha, sfx;
74 
75 	sfx = false;
76 	expect_alpha = false;
77 	*len = 0;
78 	clen = 0;
79 
80 	while ((si < se) && (c = bws_get_iter_value(si))) {
81 		if (expect_alpha) {
82 			expect_alpha = false;
83 			if (!isalpha_clocale(c) && (c != L'~'))
84 				sfx = false;
85 		} else if (c == L'.') {
86 			expect_alpha = true;
87 			if (!sfx) {
88 				sfx = true;
89 				*len = clen;
90 			}
91 		} else if (!isalnum_clocale(c) && (c != L'~'))
92 			sfx = false;
93 
94 		si = bws_iterator_inc(si, 1);
95 		++clen;
96 	}
97 
98 	/* This code must be here to make the implementation compatible
99 	 * with WORDING of GNU sort documentation.
100 	 * But the GNU sort implementation is not following its own
101 	 * documentation.  GNU sort allows empty file extensions
102 	 * (just dot with nothing after); but the regular expression in
103 	 * their documentation does not allow empty file extensions.
104 	 * We chose to make our implementation compatible with GNU sort
105 	 * implementation.  If they will ever fix their bug, this code
106 	 * must be uncommented. Or they may choose to fix the info page,
107 	 * then the code stays commented.
108 	 *
109 	 if (expect_alpha)
110 	 	sfx = false;
111 	 */
112 
113 	if (!sfx)
114 		*len = clen;
115 }
116 
117 static inline int
118 cmp_chars(wchar_t c1, wchar_t c2)
119 {
120 
121 	if (c1 == c2)
122 		return (0);
123 
124 	if (c1 == L'~')
125 		return (-1);
126 	if (c2 == L'~')
127 		return (+1);
128 
129 	if (isdigit_clocale(c1) || !c1)
130 		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
131 
132 	if (isdigit_clocale(c2) || !c2)
133 		return (+1);
134 
135 	if (isalpha_clocale(c1))
136 		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
137 
138 	if (isalpha_clocale(c2))
139 		return (+1);
140 
141 	return ((int) c1 - (int) c2);
142 }
143 
144 static int
145 cmpversions(bwstring_iterator si1, bwstring_iterator se1,
146     bwstring_iterator si2, bwstring_iterator se2)
147 {
148 	int cmp, diff;
149 
150 	while ((si1 < se1) || (si2 < se2)) {
151 		diff = 0;
152 
153 		while (((si1 < se1) &&
154 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
155 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
156 			wchar_t c1, c2;
157 
158 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
159 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
160 
161 			cmp = cmp_chars(c1, c2);
162 			if (cmp)
163 				return (cmp);
164 
165 			if (si1 < se1)
166 				si1 = bws_iterator_inc(si1, 1);
167 			if (si2 < se2)
168 				si2 = bws_iterator_inc(si2, 1);
169 		}
170 
171 		while (bws_get_iter_value(si1) == L'0')
172 			si1 = bws_iterator_inc(si1, 1);
173 
174 		while (bws_get_iter_value(si2) == L'0')
175 			si2 = bws_iterator_inc(si2, 1);
176 
177 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
178 		    isdigit_clocale(bws_get_iter_value(si2))) {
179 			if (!diff)
180 				diff = ((int)bws_get_iter_value(si1) -
181 				    (int)bws_get_iter_value(si2));
182 			si1 = bws_iterator_inc(si1, 1);
183 			si2 = bws_iterator_inc(si2, 1);
184 		}
185 
186 		if (isdigit_clocale(bws_get_iter_value(si1)))
187 			return (1);
188 
189 		if (isdigit_clocale(bws_get_iter_value(si2)))
190 			return (-1);
191 
192 		if (diff)
193 			return (diff);
194 	}
195 
196 	return (0);
197 }
198 
199 /*
200  * Compare two version strings
201  */
202 int
203 vcmp(struct bwstring *s1, struct bwstring *s2)
204 {
205 	bwstring_iterator si1, si2;
206 	wchar_t c1, c2;
207 	size_t len1, len2, slen1, slen2;
208 	int cmp_bytes, cmp_res;
209 
210 	if (s1 == s2)
211 		return (0);
212 
213 	cmp_bytes = bwscmp(s1, s2, 0);
214 	if (cmp_bytes == 0)
215 		return (0);
216 
217 	len1 = slen1 = BWSLEN(s1);
218 	len2 = slen2 = BWSLEN(s2);
219 
220 	if (slen1 < 1)
221 		return (-1);
222 	if (slen2 < 1)
223 		return (+1);
224 
225 	si1 = bws_begin(s1);
226 	si2 = bws_begin(s2);
227 
228 	c1 = bws_get_iter_value(si1);
229 	c2 = bws_get_iter_value(si2);
230 
231 	if (c1 == L'.' && (slen1 == 1))
232 		return (-1);
233 
234 	if (c2 == L'.' && (slen2 == 1))
235 		return (+1);
236 
237 	if (slen1 == 2 && c1 == L'.' &&
238 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
239 		return (-1);
240 	if (slen2 == 2 && c2 == L'.' &&
241 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
242 		return (+1);
243 
244 	if (c1 == L'.' && c2 != L'.')
245 		return (-1);
246 	if (c1 != L'.' && c2 == L'.')
247 		return (+1);
248 
249 	if (c1 == L'.' && c2 == L'.') {
250 		si1 = bws_iterator_inc(si1, 1);
251 		si2 = bws_iterator_inc(si2, 1);
252 	}
253 
254 	find_suffix(si1, bws_end(s1), &len1);
255 	find_suffix(si2, bws_end(s2), &len2);
256 
257 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
258 		return (cmp_bytes);
259 
260 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
261 	    bws_iterator_inc(si2, len2));
262 
263 	if (cmp_res == 0)
264 	  cmp_res = cmp_bytes;
265 
266 	return (cmp_res);
267 }
268