xref: /freebsd/usr.bin/sort/vsort.c (revision d0b2dbfa0ecf2bbc9709efc5e20baf8e4b44bbbf)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (C) 2012 Oleg Moskalenko <mom040267@gmail.com>
5  * Copyright (C) 2012 Gabor Kovesdan <gabor@FreeBSD.org>
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 #include <sys/types.h>
32 
33 #include <ctype.h>
34 #include <stdlib.h>
35 #include <string.h>
36 
37 #include "sort.h"
38 #include "vsort.h"
39 
40 static inline bool
41 isdigit_clocale(wchar_t c)
42 {
43 
44 	return (c >= L'0' && c <= L'9');
45 }
46 
47 static inline bool
48 isalpha_clocale(wchar_t c)
49 {
50 
51 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z'));
52 }
53 
54 static inline bool
55 isalnum_clocale(wchar_t c)
56 {
57 
58 	return ((c >= L'a' && c <= L'z') || (c >= L'A' && c <= L'Z') ||
59 	    (c >= L'0' && c <= L'9'));
60 }
61 
62 /*
63  * Find string suffix of format: (\.[A-Za-z~][A-Za-z0-9~]*)*$
64  * Set length of string before suffix.
65  */
66 static void
67 find_suffix(bwstring_iterator si, bwstring_iterator se, size_t *len)
68 {
69 	wchar_t c;
70 	size_t clen;
71 	bool expect_alpha, sfx;
72 
73 	sfx = false;
74 	expect_alpha = false;
75 	*len = 0;
76 	clen = 0;
77 
78 	while ((si < se) && (c = bws_get_iter_value(si))) {
79 		if (expect_alpha) {
80 			expect_alpha = false;
81 			if (!isalpha_clocale(c) && (c != L'~'))
82 				sfx = false;
83 		} else if (c == L'.') {
84 			expect_alpha = true;
85 			if (!sfx) {
86 				sfx = true;
87 				*len = clen;
88 			}
89 		} else if (!isalnum_clocale(c) && (c != L'~'))
90 			sfx = false;
91 
92 		si = bws_iterator_inc(si, 1);
93 		++clen;
94 	}
95 
96 	/* This code must be here to make the implementation compatible
97 	 * with WORDING of GNU sort documentation.
98 	 * But the GNU sort implementation is not following its own
99 	 * documentation.  GNU sort allows empty file extensions
100 	 * (just dot with nothing after); but the regular expression in
101 	 * their documentation does not allow empty file extensions.
102 	 * We chose to make our implementation compatible with GNU sort
103 	 * implementation.  If they will ever fix their bug, this code
104 	 * must be uncommented. Or they may choose to fix the info page,
105 	 * then the code stays commented.
106 	 *
107 	 if (expect_alpha)
108 	 	sfx = false;
109 	 */
110 
111 	if (!sfx)
112 		*len = clen;
113 }
114 
115 static inline int
116 cmp_chars(wchar_t c1, wchar_t c2)
117 {
118 
119 	if (c1 == c2)
120 		return (0);
121 
122 	if (c1 == L'~')
123 		return (-1);
124 	if (c2 == L'~')
125 		return (+1);
126 
127 	if (isdigit_clocale(c1) || !c1)
128 		return ((isdigit_clocale(c2) || !c2) ? 0 : -1);
129 
130 	if (isdigit_clocale(c2) || !c2)
131 		return (+1);
132 
133 	if (isalpha_clocale(c1))
134 		return ((isalpha_clocale(c2)) ? ((int) c1 - (int) c2) : -1);
135 
136 	if (isalpha_clocale(c2))
137 		return (+1);
138 
139 	return ((int) c1 - (int) c2);
140 }
141 
142 static int
143 cmpversions(bwstring_iterator si1, bwstring_iterator se1,
144     bwstring_iterator si2, bwstring_iterator se2)
145 {
146 	int cmp, diff;
147 
148 	while ((si1 < se1) || (si2 < se2)) {
149 		diff = 0;
150 
151 		while (((si1 < se1) &&
152 		    !isdigit_clocale(bws_get_iter_value(si1))) ||
153 		    ((si2 < se2) && !isdigit_clocale(bws_get_iter_value(si2)))) {
154 			wchar_t c1, c2;
155 
156 			c1 = (si1 < se1) ? bws_get_iter_value(si1) : 0;
157 			c2 = (si2 < se2) ? bws_get_iter_value(si2) : 0;
158 
159 			cmp = cmp_chars(c1, c2);
160 			if (cmp)
161 				return (cmp);
162 
163 			if (si1 < se1)
164 				si1 = bws_iterator_inc(si1, 1);
165 			if (si2 < se2)
166 				si2 = bws_iterator_inc(si2, 1);
167 		}
168 
169 		while (bws_get_iter_value(si1) == L'0')
170 			si1 = bws_iterator_inc(si1, 1);
171 
172 		while (bws_get_iter_value(si2) == L'0')
173 			si2 = bws_iterator_inc(si2, 1);
174 
175 		while (isdigit_clocale(bws_get_iter_value(si1)) &&
176 		    isdigit_clocale(bws_get_iter_value(si2))) {
177 			if (!diff)
178 				diff = ((int)bws_get_iter_value(si1) -
179 				    (int)bws_get_iter_value(si2));
180 			si1 = bws_iterator_inc(si1, 1);
181 			si2 = bws_iterator_inc(si2, 1);
182 		}
183 
184 		if (isdigit_clocale(bws_get_iter_value(si1)))
185 			return (1);
186 
187 		if (isdigit_clocale(bws_get_iter_value(si2)))
188 			return (-1);
189 
190 		if (diff)
191 			return (diff);
192 	}
193 
194 	return (0);
195 }
196 
197 /*
198  * Compare two version strings
199  */
200 int
201 vcmp(struct bwstring *s1, struct bwstring *s2)
202 {
203 	bwstring_iterator si1, si2;
204 	wchar_t c1, c2;
205 	size_t len1, len2, slen1, slen2;
206 	int cmp_bytes, cmp_res;
207 
208 	if (s1 == s2)
209 		return (0);
210 
211 	cmp_bytes = bwscmp(s1, s2, 0);
212 	if (cmp_bytes == 0)
213 		return (0);
214 
215 	len1 = slen1 = BWSLEN(s1);
216 	len2 = slen2 = BWSLEN(s2);
217 
218 	if (slen1 < 1)
219 		return (-1);
220 	if (slen2 < 1)
221 		return (+1);
222 
223 	si1 = bws_begin(s1);
224 	si2 = bws_begin(s2);
225 
226 	c1 = bws_get_iter_value(si1);
227 	c2 = bws_get_iter_value(si2);
228 
229 	if (c1 == L'.' && (slen1 == 1))
230 		return (-1);
231 
232 	if (c2 == L'.' && (slen2 == 1))
233 		return (+1);
234 
235 	if (slen1 == 2 && c1 == L'.' &&
236 	    bws_get_iter_value(bws_iterator_inc(si1, 1)) == L'.')
237 		return (-1);
238 	if (slen2 == 2 && c2 == L'.' &&
239 	    bws_get_iter_value(bws_iterator_inc(si2, 1)) == L'.')
240 		return (+1);
241 
242 	if (c1 == L'.' && c2 != L'.')
243 		return (-1);
244 	if (c1 != L'.' && c2 == L'.')
245 		return (+1);
246 
247 	if (c1 == L'.' && c2 == L'.') {
248 		si1 = bws_iterator_inc(si1, 1);
249 		si2 = bws_iterator_inc(si2, 1);
250 	}
251 
252 	find_suffix(si1, bws_end(s1), &len1);
253 	find_suffix(si2, bws_end(s2), &len2);
254 
255 	if ((len1 == len2) && (bws_iterator_cmp(si1, si2, len1) == 0))
256 		return (cmp_bytes);
257 
258 	cmp_res = cmpversions(si1, bws_iterator_inc(si1, len1), si2,
259 	    bws_iterator_inc(si2, len2));
260 
261 	if (cmp_res == 0)
262 	  cmp_res = cmp_bytes;
263 
264 	return (cmp_res);
265 }
266