xref: /freebsd/sys/fs/udf/osta.c (revision 51a7b740a11434580f649a98c2b44b98d60e4292)
1 /*
2  * Various routines from the OSTA 2.01 specs.  Copyrights are included with
3  * each code segment.  Slight whitespace modifications have been made for
4  * formatting purposes.  Typos/bugs have been fixed.
5  *
6  * $FreeBSD$
7  */
8 
9 #include <fs/udf/osta.h>
10 
11 /*****************************************************************************/
12 /***********************************************************************
13  * OSTA compliant Unicode compression, uncompression routines.
14  * Copyright 1995 Micro Design International, Inc.
15  * Written by Jason M. Rinn.
16  * Micro Design International gives permission for the free use of the
17  * following source code.
18  */
19 
20 #include <stddef.h>
21 /***********************************************************************
22  * Takes an OSTA CS0 compressed unicode name, and converts
23  * it to Unicode.
24  * The Unicode output will be in the byte order
25  * that the local compiler uses for 16-bit values.
26  * NOTE: This routine only performs error checking on the compID.
27  * It is up to the user to ensure that the unicode buffer is large
28  * enough, and that the compressed unicode name is correct.
29  *
30  * RETURN VALUE
31  *
32  * The number of unicode characters which were uncompressed.
33  * A -1 is returned if the compression ID is invalid.
34  */
35 int
36 udf_UncompressUnicode(
37 	int numberOfBytes,	/* (Input) number of bytes read from media. */
38 	byte *UDFCompressed,	/* (Input) bytes read from media. */
39 	unicode_t *unicode)	/* (Output) uncompressed unicode characters. */
40 {
41 	unsigned int compID;
42 	int returnValue, unicodeIndex, byteIndex;
43 
44 	/* Use UDFCompressed to store current byte being read. */
45 	compID = UDFCompressed[0];
46 
47 	/* First check for valid compID. */
48 	if (compID != 8 && compID != 16) {
49 		returnValue = -1;
50 	} else {
51 		unicodeIndex = 0;
52 		byteIndex = 1;
53 
54 		/* Loop through all the bytes. */
55 		while (byteIndex < numberOfBytes) {
56 			if (compID == 16) {
57 				/* Move the first byte to the high bits of the
58 				 * unicode char.
59 				 */
60 				unicode[unicodeIndex] =
61 				    UDFCompressed[byteIndex++] << 8;
62 			} else {
63 				unicode[unicodeIndex] = 0;
64 			}
65 			if (byteIndex < numberOfBytes) {
66 				/*Then the next byte to the low bits. */
67 				unicode[unicodeIndex] |=
68 				    UDFCompressed[byteIndex++];
69 			}
70 			unicodeIndex++;
71 		}
72 		returnValue = unicodeIndex;
73 	}
74 	return(returnValue);
75 }
76 
77 /***********************************************************************
78  * DESCRIPTION:
79  * Takes a string of unicode wide characters and returns an OSTA CS0
80  * compressed unicode string. The unicode MUST be in the byte order of
81  * the compiler in order to obtain correct results. Returns an error
82  * if the compression ID is invalid.
83  *
84  * NOTE: This routine assumes the implementation already knows, by
85  * the local environment, how many bits are appropriate and
86  * therefore does no checking to test if the input characters fit
87  * into that number of bits or not.
88  *
89  * RETURN VALUE
90  *
91  * The total number of bytes in the compressed OSTA CS0 string,
92  * including the compression ID.
93  * A -1 is returned if the compression ID is invalid.
94  */
95 int
96 udf_CompressUnicode(
97 	int numberOfChars,	/* (Input) number of unicode characters. */
98 	int compID,		/* (Input) compression ID to be used. */
99 	unicode_t *unicode,	/* (Input) unicode characters to compress. */
100 	byte *UDFCompressed)	/* (Output) compressed string, as bytes. */
101 {
102 	int byteIndex, unicodeIndex;
103 
104 	if (compID != 8 && compID != 16) {
105 		byteIndex = -1; /* Unsupported compression ID ! */
106 	} else {
107 		/* Place compression code in first byte. */
108 		UDFCompressed[0] = compID;
109 
110 		byteIndex = 1;
111 		unicodeIndex = 0;
112 		while (unicodeIndex < numberOfChars) {
113 			if (compID == 16) {
114 				/* First, place the high bits of the char
115 				 * into the byte stream.
116 				 */
117 				UDFCompressed[byteIndex++] =
118 				    (unicode[unicodeIndex] & 0xFF00) >> 8;
119 			}
120 			/*Then place the low bits into the stream. */
121 			UDFCompressed[byteIndex++] =
122 			    unicode[unicodeIndex] & 0x00FF;
123 			unicodeIndex++;
124 		}
125 	}
126 	return(byteIndex);
127 }
128 
129 /*****************************************************************************/
/*
 * CRC 010041
 *
 * Byte-wise lookup table for the 16-bit CRC with generator polynomial
 * 010041 (octal) == 0x1021, processed most-significant bit first.
 * Entry i is the CRC register after shifting the byte value i through
 * an initially zero register.  The table is only ever read, so it is
 * declared const.
 */
static const unsigned short crc_table[256] = {
	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
	0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
	0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
	0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
	0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
	0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
	0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
	0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
	0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
	0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
	0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
	0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
	0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
	0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
	0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
	0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
	0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
	0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
	0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
	0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
	0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
	0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
	0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
	0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
	0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
	0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
	0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
	0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
	0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
	0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
	0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
	0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};
167 
168 unsigned short
169 udf_cksum(s, n)
170 	unsigned char *s;
171 	int n;
172 {
173 	unsigned short crc=0;
174 
175 	while (n-- > 0)
176 		crc = crc_table[(crc>>8 ^ *s++) & 0xff] ^ (crc<<8);
177 	return crc;
178 }
179 
180 /* UNICODE Checksum */
181 unsigned short
182 udf_unicode_cksum(s, n)
183 	unsigned short *s;
184 	int n;
185 {
186 	unsigned short crc=0;
187 
188 	while (n-- > 0) {
189 		/* Take high order byte first--corresponds to a big endian
190 		 * byte stream.
191 		 */
192 		crc = crc_table[(crc>>8 ^ (*s>>8)) & 0xff] ^ (crc<<8);
193 		crc = crc_table[(crc>>8 ^ (*s++ & 0xff)) & 0xff] ^ (crc<<8);
194 	}
195 	return crc;
196 }
197 
#ifdef MAIN
/*
 * Stand-alone self-test, built only when MAIN is defined: prints the
 * checksum udf_cksum() calculates for a three byte sample next to the
 * expected value 0x3299.
 */
#include <stdio.h>

unsigned char bytes[] = { 0x70, 0x6A, 0x77 };

int
main(void)
{
	unsigned short x;

	x = udf_cksum(bytes, sizeof bytes);
	printf("checksum: calculated=%4.4x, correct=%4.4x\n", x, 0x3299);
	return (0);
}
#endif
209 
210 /*****************************************************************************/
211 #ifdef NEEDS_ISPRINT
212 /***********************************************************************
213  * OSTA UDF compliant file name translation routine for OS/2,
214  * Windows 95, Windows NT, Macintosh and UNIX.
215  * Copyright 1995 Micro Design International, Inc.
216  * Written by Jason M. Rinn.
217  * Micro Design International gives permission for the free use of the
218  * following source code.
219  */
220 
221 /***********************************************************************
222  * To use these routines with different operating systems.
223  *
224  * OS/2
225  * Define OS2
226  * Define MAXLEN = 254
227  *
228  * Windows 95
229  * Define WIN_95
230  * Define MAXLEN = 255
231  *
232  * Windows NT
233  * Define WIN_NT
234  * Define MAXLEN = 255
235  *
236  * Macintosh:
237  * Define MAC.
238  * Define MAXLEN = 31.
239  *
240  * UNIX
241  * Define UNIX.
242  * Define MAXLEN as specified by unix version.
243  */
244 
/* Stand-in written for runs of illegal or non-printable characters: '_'. */
#define	ILLEGAL_CHAR_MARK	0x005F
/* Character placed in front of the four hex CRC digits: '#'. */
#define	CRC_MARK	0x0023
/* Longest extension (characters after the period) that is recognized. */
#define	EXT_SIZE	5
#define	TRUE	1
#define	FALSE	0
/* '.' */
#define	PERIOD	0x002E
/* ' ' */
#define	SPACE	0x0020

/*** PROTOTYPES ***/
/* Returns non-zero if ch is not allowed in a file name on the target OS. */
int IsIllegal(unicode_t ch);

/* Define a function or macro which determines if a Unicode character is
 * printable under your implementation.
 */
int UnicodeIsPrint(unicode_t);
260 
261 /***********************************************************************
262  * Translates a long file name to one using a MAXLEN and an illegal
263  * char set in accord with the OSTA requirements. Assumes the name has
264  * already been translated to Unicode.
265  *
266  * RETURN VALUE
267  *
268  * Number of unicode characters in translated name.
269  */
270 int UDFTransName(
271 	unicode_t *newName,	/* (Output)Translated name. Must be of length
272 				 * MAXLEN */
273 	unicode_t *udfName,	/* (Input) Name from UDF volume.*/
274 	int udfLen)		/* (Input) Length of UDF Name. */
275 {
276 	int index, newIndex = 0, needsCRC = FALSE;
277 	int extIndex = 0, newExtIndex = 0, hasExt = FALSE;
278 #if defined OS2 || defined WIN_95 || defined WIN_NT
279 	int trailIndex = 0;
280 #endif
281 	unsigned short valueCRC;
282 	unicode_t current;
283 	const char hexChar[] = "0123456789ABCDEF";
284 
285 	for (index = 0; index < udfLen; index++) {
286 		current = udfName[index];
287 
288 		if (IsIllegal(current) || !UnicodeIsPrint(current)) {
289 			needsCRC = TRUE;
290 			/* Replace Illegal and non-displayable chars with
291 			 * underscore.
292 			 */
293 			current = ILLEGAL_CHAR_MARK;
294 			/* Skip any other illegal or non-displayable
295 			 * characters.
296 			 */
297 			while(index+1 < udfLen && (IsIllegal(udfName[index+1])
298 			    || !UnicodeIsPrint(udfName[index+1]))) {
299 				index++;
300 			}
301 		}
302 
303 		/* Record position of extension, if one is found. */
304 		if (current == PERIOD && (udfLen - index -1) <= EXT_SIZE) {
305 			if (udfLen == index + 1) {
306 				/* A trailing period is NOT an extension. */
307 				hasExt = FALSE;
308 			} else {
309 				hasExt = TRUE;
310 				extIndex = index;
311 				newExtIndex = newIndex;
312 			}
313 		}
314 
315 #if defined OS2 || defined WIN_95 || defined WIN_NT
316 		/* Record position of last char which is NOT period or space. */
317 		else if (current != PERIOD && current != SPACE) {
318 			trailIndex = newIndex;
319 		}
320 #endif
321 
322 		if (newIndex < MAXLEN) {
323 			newName[newIndex++] = current;
324 		} else {
325 			needsCRC = TRUE;
326 		}
327 	}
328 
329 #if defined OS2 || defined WIN_95 || defined WIN_NT
330 	/* For OS2, 95 & NT, truncate any trailing periods and\or spaces. */
331 	if (trailIndex != newIndex - 1) {
332 		newIndex = trailIndex + 1;
333 		needsCRC = TRUE;
334 		hasExt = FALSE; /* Trailing period does not make an
335 				 * extension. */
336 	}
337 #endif
338 
339 	if (needsCRC) {
340 		unicode_t ext[EXT_SIZE];
341 		int localExtIndex = 0;
342 		if (hasExt) {
343 			int maxFilenameLen;
344 			/* Translate extension, and store it in ext. */
345 			for(index = 0; index<EXT_SIZE &&
346 			    extIndex + index +1 < udfLen; index++ ) {
347 				current = udfName[extIndex + index + 1];
348 				if (IsIllegal(current) ||
349 				    !UnicodeIsPrint(current)) {
350 					needsCRC = 1;
351 					/* Replace Illegal and non-displayable
352 					 * chars with underscore.
353 					 */
354 					current = ILLEGAL_CHAR_MARK;
355 					/* Skip any other illegal or
356 					 * non-displayable characters.
357 					 */
358 					while(index + 1 < EXT_SIZE
359 					    && (IsIllegal(udfName[extIndex +
360 					    index + 2]) ||
361 					    !isprint(udfName[extIndex +
362 					    index + 2]))) {
363 						index++;
364 					}
365 				}
366 				ext[localExtIndex++] = current;
367 			}
368 
369 			/* Truncate filename to leave room for extension and
370 			 * CRC.
371 			 */
372 			maxFilenameLen = ((MAXLEN - 5) - localExtIndex - 1);
373 			if (newIndex > maxFilenameLen) {
374 				newIndex = maxFilenameLen;
375 			} else {
376 				newIndex = newExtIndex;
377 			}
378 		} else if (newIndex > MAXLEN - 5) {
379 			/*If no extension, make sure to leave room for CRC. */
380 			newIndex = MAXLEN - 5;
381 		}
382 		newName[newIndex++] = CRC_MARK; /* Add mark for CRC. */
383 
384 		/*Calculate CRC from original filename from FileIdentifier. */
385 		valueCRC = udf_unicode_cksum(udfName, udfLen);
386 		/* Convert 16-bits of CRC to hex characters. */
387 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
388 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
389 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
390 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
391 
392 		/* Place a translated extension at end, if found. */
393 		if (hasExt) {
394 			newName[newIndex++] = PERIOD;
395 			for (index = 0;index < localExtIndex ;index++ ) {
396 				newName[newIndex++] = ext[index];
397 			}
398 		}
399 	}
400 	return(newIndex);
401 }
402 
403 #if defined OS2 || defined WIN_95 || defined WIN_NT
404 /***********************************************************************
405  * Decides if a Unicode character matches one of a list
406  * of ASCII characters.
407  * Used by OS2 version of IsIllegal for readability, since all of the
408  * illegal characters above 0x0020 are in the ASCII subset of Unicode.
409  * Works very similarly to the standard C function strchr().
410  *
411  * RETURN VALUE
412  *
413  * Non-zero if the Unicode character is in the given ASCII string.
414  */
415 int UnicodeInString(
416 	unsigned char *string,	/* (Input) String to search through. */
417 	unicode_t ch)		/* (Input) Unicode char to search for. */
418 {
419 	int found = FALSE;
420 	while (*string != '\0' && found == FALSE) {
421 		/* These types should compare, since both are unsigned
422 		 * numbers. */
423 		if (*string == ch) {
424 			found = TRUE;
425 		}
426 		string++;
427 	}
428 	return(found);
429 }
430 #endif /* OS2 */
431 
432 /***********************************************************************
433  * Decides whether the given character is illegal for a given OS.
434  *
435  * RETURN VALUE
436  *
437  * Non-zero if char is illegal.
438  */
439 int IsIllegal(unicode_t ch)
440 {
441 #ifdef MAC
442 	/* Only illegal character on the MAC is the colon. */
443 	if (ch == 0x003A) {
444 		return(1);
445 	} else {
446 		return(0);
447 	}
448 
449 #elif defined UNIX
450 	/* Illegal UNIX characters are NULL and slash. */
451 	if (ch == 0x0000 || ch == 0x002F) {
452 		return(1);
453 	} else {
454 		return(0);
455 	}
456 
457 #elif defined OS2 || defined WIN_95 || defined WIN_NT
458 	/* Illegal char's for OS/2 according to WARP toolkit. */
459 	if (ch < 0x0020 || UnicodeInString("\\/:*?\"<>|", ch)) {
460 		return(1);
461 	} else {
462 		return(0);
463 	}
464 #endif
465 }
466 #endif
467