xref: /freebsd/sys/fs/udf/osta.c (revision 74bf4e164ba5851606a27d4feff27717452583e5)
1 /*
2  * Various routines from the OSTA 2.01 specs.  Copyrights are included with
3  * each code segment.  Slight whitespace modifications have been made for
4  * formatting purposes.  Typos/bugs have been fixed.
5  *
6  * $FreeBSD$
7  */
8 
9 #include <fs/udf/osta.h>
10 
11 /*****************************************************************************/
12 /***********************************************************************
13  * OSTA compliant Unicode compression, uncompression routines.
14  * Copyright 1995 Micro Design International, Inc.
15  * Written by Jason M. Rinn.
16  * Micro Design International gives permission for the free use of the
17  * following source code.
18  */
19 
20 /***********************************************************************
21  * Takes an OSTA CS0 compressed unicode name, and converts
22  * it to Unicode.
23  * The Unicode output will be in the byte order
24  * that the local compiler uses for 16-bit values.
25  * NOTE: This routine only performs error checking on the compID.
26  * It is up to the user to ensure that the unicode buffer is large
27  * enough, and that the compressed unicode name is correct.
28  *
29  * RETURN VALUE
30  *
31  * The number of unicode characters which were uncompressed.
32  * A -1 is returned if the compression ID is invalid.
33  */
34 int
35 udf_UncompressUnicode(
36 	int numberOfBytes,	/* (Input) number of bytes read from media. */
37 	byte *UDFCompressed,	/* (Input) bytes read from media. */
38 	unicode_t *unicode)	/* (Output) uncompressed unicode characters. */
39 {
40 	unsigned int compID;
41 	int returnValue, unicodeIndex, byteIndex;
42 
43 	/* Use UDFCompressed to store current byte being read. */
44 	compID = UDFCompressed[0];
45 
46 	/* First check for valid compID. */
47 	if (compID != 8 && compID != 16) {
48 		returnValue = -1;
49 	} else {
50 		unicodeIndex = 0;
51 		byteIndex = 1;
52 
53 		/* Loop through all the bytes. */
54 		while (byteIndex < numberOfBytes) {
55 			if (compID == 16) {
56 				/* Move the first byte to the high bits of the
57 				 * unicode char.
58 				 */
59 				unicode[unicodeIndex] =
60 				    UDFCompressed[byteIndex++] << 8;
61 			} else {
62 				unicode[unicodeIndex] = 0;
63 			}
64 			if (byteIndex < numberOfBytes) {
65 				/*Then the next byte to the low bits. */
66 				unicode[unicodeIndex] |=
67 				    UDFCompressed[byteIndex++];
68 			}
69 			unicodeIndex++;
70 		}
71 		returnValue = unicodeIndex;
72 	}
73 	return(returnValue);
74 }
75 
76 /*
77  * Almost same as udf_UncompressUnicode(). The difference is that
78  * it keeps byte order of unicode string.
79  */
80 int
81 udf_UncompressUnicodeByte(
82 	int numberOfBytes,	/* (Input) number of bytes read from media. */
83 	byte *UDFCompressed,	/* (Input) bytes read from media. */
84 	byte *unicode)		/* (Output) uncompressed unicode characters. */
85 {
86 	unsigned int compID;
87 	int returnValue, unicodeIndex, byteIndex;
88 
89 	/* Use UDFCompressed to store current byte being read. */
90 	compID = UDFCompressed[0];
91 
92 	/* First check for valid compID. */
93 	if (compID != 8 && compID != 16) {
94 		returnValue = -1;
95 	} else {
96 		unicodeIndex = 0;
97 		byteIndex = 1;
98 
99 		/* Loop through all the bytes. */
100 		while (byteIndex < numberOfBytes) {
101 			if (compID == 16) {
102 				/* Move the first byte to the high bits of the
103 				 * unicode char.
104 				 */
105 				unicode[unicodeIndex++] =
106 				    UDFCompressed[byteIndex++];
107 			} else {
108 				unicode[unicodeIndex++] = 0;
109 			}
110 			if (byteIndex < numberOfBytes) {
111 				/*Then the next byte to the low bits. */
112 				unicode[unicodeIndex++] =
113 				    UDFCompressed[byteIndex++];
114 			}
115 		}
116 		returnValue = unicodeIndex;
117 	}
118 	return(returnValue);
119 }
120 
121 /***********************************************************************
122  * DESCRIPTION:
123  * Takes a string of unicode wide characters and returns an OSTA CS0
124  * compressed unicode string. The unicode MUST be in the byte order of
125  * the compiler in order to obtain correct results. Returns an error
126  * if the compression ID is invalid.
127  *
128  * NOTE: This routine assumes the implementation already knows, by
129  * the local environment, how many bits are appropriate and
130  * therefore does no checking to test if the input characters fit
131  * into that number of bits or not.
132  *
133  * RETURN VALUE
134  *
135  * The total number of bytes in the compressed OSTA CS0 string,
136  * including the compression ID.
137  * A -1 is returned if the compression ID is invalid.
138  */
139 int
140 udf_CompressUnicode(
141 	int numberOfChars,	/* (Input) number of unicode characters. */
142 	int compID,		/* (Input) compression ID to be used. */
143 	unicode_t *unicode,	/* (Input) unicode characters to compress. */
144 	byte *UDFCompressed)	/* (Output) compressed string, as bytes. */
145 {
146 	int byteIndex, unicodeIndex;
147 
148 	if (compID != 8 && compID != 16) {
149 		byteIndex = -1; /* Unsupported compression ID ! */
150 	} else {
151 		/* Place compression code in first byte. */
152 		UDFCompressed[0] = compID;
153 
154 		byteIndex = 1;
155 		unicodeIndex = 0;
156 		while (unicodeIndex < numberOfChars) {
157 			if (compID == 16) {
158 				/* First, place the high bits of the char
159 				 * into the byte stream.
160 				 */
161 				UDFCompressed[byteIndex++] =
162 				    (unicode[unicodeIndex] & 0xFF00) >> 8;
163 			}
164 			/*Then place the low bits into the stream. */
165 			UDFCompressed[byteIndex++] =
166 			    unicode[unicodeIndex] & 0x00FF;
167 			unicodeIndex++;
168 		}
169 	}
170 	return(byteIndex);
171 }
172 
173 /*****************************************************************************/
/*
 * CRC 010041
 *
 * Lookup table for a byte-at-a-time 16-bit CRC whose generator
 * polynomial is octal 010041 (0x1021).  Entry i is the CRC register
 * contribution of the byte value i.  The table is never written, so it
 * is declared const.
 */
static const unsigned short crc_table[256] = {
	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
	0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
	0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
	0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
	0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
	0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
	0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
	0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
	0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
	0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
	0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
	0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
	0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
	0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
	0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
	0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
	0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
	0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
	0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
	0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
	0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
	0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
	0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
	0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
	0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
	0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
	0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
	0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
	0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
	0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
	0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
	0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};

/*
 * Computes the 16-bit CRC of the n bytes starting at s.  The CRC
 * register starts at 0; a count of zero or less yields 0.
 */
unsigned short
udf_cksum(unsigned char *s, int n)
{
	unsigned short crc = 0;

	while (n-- > 0) {
		/*
		 * The top byte of the register combined with the next
		 * input byte selects the table entry; the bottom byte
		 * moves up.  The assignment truncates to 16 bits.
		 */
		crc = crc_table[((crc >> 8) ^ *s++) & 0xff] ^ (crc << 8);
	}
	return (crc);
}

/*
 * UNICODE Checksum
 *
 * Computes the same CRC over n 16-bit characters starting at s.  Each
 * character is fed in as two bytes, high-order byte first, which
 * corresponds to checksumming a big endian byte stream regardless of
 * the host byte order.
 */
unsigned short
udf_unicode_cksum(unsigned short *s, int n)
{
	unsigned short crc = 0;

	while (n-- > 0) {
		/* High-order byte of the character first ... */
		crc = crc_table[((crc >> 8) ^ (*s >> 8)) & 0xff] ^ (crc << 8);
		/* ... then the low-order byte, and move to the next one. */
		crc = crc_table[((crc >> 8) ^ (*s++ & 0xff)) & 0xff] ^
		    (crc << 8);
	}
	return (crc);
}
241 
#ifdef MAIN
/*
 * Stand-alone self test, compiled only when MAIN is defined.  It
 * checksums a three-byte sample with udf_cksum() and prints the result
 * next to the expected value so the two can be compared by eye.
 */
#include <stdio.h>

unsigned char bytes[] = { 0x70, 0x6A, 0x77 };

int
main(void)
{
	unsigned short x;

	x = udf_cksum(bytes, sizeof bytes);
	/* %x takes an unsigned int, so pass both values as such. */
	printf("checksum: calculated=%4.4x, correct=%4.4x\n",
	    (unsigned int)x, 0x3299U);
	return (0);
}
#endif
253 
254 /*****************************************************************************/
255 #ifdef NEEDS_ISPRINT
256 /***********************************************************************
257  * OSTA UDF compliant file name translation routine for OS/2,
258  * Windows 95, Windows NT, Macintosh and UNIX.
259  * Copyright 1995 Micro Design International, Inc.
260  * Written by Jason M. Rinn.
261  * Micro Design International gives permission for the free use of the
262  * following source code.
263  */
264 
265 /***********************************************************************
266  * To use these routines with different operating systems.
267  *
268  * OS/2
269  * Define OS2
270  * Define MAXLEN = 254
271  *
272  * Windows 95
273  * Define WIN_95
274  * Define MAXLEN = 255
275  *
276  * Windows NT
277  * Define WIN_NT
278  * Define MAXLEN = 255
279  *
280  * Macintosh:
281  * Define MAC.
282  * Define MAXLEN = 31.
283  *
284  * UNIX
285  * Define UNIX.
286  * Define MAXLEN as specified by unix version.
287  */
288 
/* Boolean values used for the translation flags. */
#define	FALSE	0
#define	TRUE	1

/* Unicode characters the translation looks for in a name. */
#define	SPACE	0x0020
#define	PERIOD	0x002E

/* Underscore: written in place of illegal or non-displayable chars. */
#define	ILLEGAL_CHAR_MARK	0x005F
/* Number sign: placed in front of the hex CRC in a translated name. */
#define	CRC_MARK	0x0023
/* Most characters that may follow a period and still be an extension. */
#define	EXT_SIZE	5
296 
297 /*** PROTOTYPES ***/
298 int IsIllegal(unicode_t ch);
299 
300 /* Define a function or macro which determines if a Unicode character is
301  * printable under your implementation.
302  */
303 int UnicodeIsPrint(unicode_t);
304 
305 /***********************************************************************
306  * Translates a long file name to one using a MAXLEN and an illegal
307  * char set in accord with the OSTA requirements. Assumes the name has
308  * already been translated to Unicode.
309  *
310  * RETURN VALUE
311  *
312  * Number of unicode characters in translated name.
313  */
314 int UDFTransName(
315 	unicode_t *newName,	/* (Output)Translated name. Must be of length
316 				 * MAXLEN */
317 	unicode_t *udfName,	/* (Input) Name from UDF volume.*/
318 	int udfLen)		/* (Input) Length of UDF Name. */
319 {
320 	int index, newIndex = 0, needsCRC = FALSE;
321 	int extIndex = 0, newExtIndex = 0, hasExt = FALSE;
322 #if defined OS2 || defined WIN_95 || defined WIN_NT
323 	int trailIndex = 0;
324 #endif
325 	unsigned short valueCRC;
326 	unicode_t current;
327 	const char hexChar[] = "0123456789ABCDEF";
328 
329 	for (index = 0; index < udfLen; index++) {
330 		current = udfName[index];
331 
332 		if (IsIllegal(current) || !UnicodeIsPrint(current)) {
333 			needsCRC = TRUE;
334 			/* Replace Illegal and non-displayable chars with
335 			 * underscore.
336 			 */
337 			current = ILLEGAL_CHAR_MARK;
338 			/* Skip any other illegal or non-displayable
339 			 * characters.
340 			 */
341 			while(index+1 < udfLen && (IsIllegal(udfName[index+1])
342 			    || !UnicodeIsPrint(udfName[index+1]))) {
343 				index++;
344 			}
345 		}
346 
347 		/* Record position of extension, if one is found. */
348 		if (current == PERIOD && (udfLen - index -1) <= EXT_SIZE) {
349 			if (udfLen == index + 1) {
350 				/* A trailing period is NOT an extension. */
351 				hasExt = FALSE;
352 			} else {
353 				hasExt = TRUE;
354 				extIndex = index;
355 				newExtIndex = newIndex;
356 			}
357 		}
358 
359 #if defined OS2 || defined WIN_95 || defined WIN_NT
360 		/* Record position of last char which is NOT period or space. */
361 		else if (current != PERIOD && current != SPACE) {
362 			trailIndex = newIndex;
363 		}
364 #endif
365 
366 		if (newIndex < MAXLEN) {
367 			newName[newIndex++] = current;
368 		} else {
369 			needsCRC = TRUE;
370 		}
371 	}
372 
373 #if defined OS2 || defined WIN_95 || defined WIN_NT
374 	/* For OS2, 95 & NT, truncate any trailing periods and\or spaces. */
375 	if (trailIndex != newIndex - 1) {
376 		newIndex = trailIndex + 1;
377 		needsCRC = TRUE;
378 		hasExt = FALSE; /* Trailing period does not make an
379 				 * extension. */
380 	}
381 #endif
382 
383 	if (needsCRC) {
384 		unicode_t ext[EXT_SIZE];
385 		int localExtIndex = 0;
386 		if (hasExt) {
387 			int maxFilenameLen;
388 			/* Translate extension, and store it in ext. */
389 			for(index = 0; index<EXT_SIZE &&
390 			    extIndex + index +1 < udfLen; index++ ) {
391 				current = udfName[extIndex + index + 1];
392 				if (IsIllegal(current) ||
393 				    !UnicodeIsPrint(current)) {
394 					needsCRC = 1;
395 					/* Replace Illegal and non-displayable
396 					 * chars with underscore.
397 					 */
398 					current = ILLEGAL_CHAR_MARK;
399 					/* Skip any other illegal or
400 					 * non-displayable characters.
401 					 */
402 					while(index + 1 < EXT_SIZE
403 					    && (IsIllegal(udfName[extIndex +
404 					    index + 2]) ||
405 					    !isprint(udfName[extIndex +
406 					    index + 2]))) {
407 						index++;
408 					}
409 				}
410 				ext[localExtIndex++] = current;
411 			}
412 
413 			/* Truncate filename to leave room for extension and
414 			 * CRC.
415 			 */
416 			maxFilenameLen = ((MAXLEN - 5) - localExtIndex - 1);
417 			if (newIndex > maxFilenameLen) {
418 				newIndex = maxFilenameLen;
419 			} else {
420 				newIndex = newExtIndex;
421 			}
422 		} else if (newIndex > MAXLEN - 5) {
423 			/*If no extension, make sure to leave room for CRC. */
424 			newIndex = MAXLEN - 5;
425 		}
426 		newName[newIndex++] = CRC_MARK; /* Add mark for CRC. */
427 
428 		/*Calculate CRC from original filename from FileIdentifier. */
429 		valueCRC = udf_unicode_cksum(udfName, udfLen);
430 		/* Convert 16-bits of CRC to hex characters. */
431 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
432 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
433 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
434 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
435 
436 		/* Place a translated extension at end, if found. */
437 		if (hasExt) {
438 			newName[newIndex++] = PERIOD;
439 			for (index = 0;index < localExtIndex ;index++ ) {
440 				newName[newIndex++] = ext[index];
441 			}
442 		}
443 	}
444 	return(newIndex);
445 }
446 
#if defined OS2 || defined WIN_95 || defined WIN_NT
/***********************************************************************
 * Tests whether a Unicode character occurs in a NUL-terminated string
 * of ASCII characters, in the manner of the standard C function
 * strchr().  The OS/2 flavour of IsIllegal uses it for readability,
 * since all of the illegal characters above 0x0020 are in the ASCII
 * subset of Unicode.  The terminating NUL itself is never matched.
 *
 * RETURN VALUE
 *
 * Non-zero if the Unicode character is in the given ASCII string.
 */
int UnicodeInString(
	unsigned char *string,	/* (Input) String to search through. */
	unicode_t ch)		/* (Input) Unicode char to search for. */
{
	unsigned char *p;

	for (p = string; *p != '\0'; p++) {
		/* Both sides are unsigned, so they compare directly. */
		if (*p == ch)
			return (TRUE);
	}
	return (FALSE);
}
#endif /* OS2 || WIN_95 || WIN_NT */
475 
476 /***********************************************************************
477  * Decides whether the given character is illegal for a given OS.
478  *
479  * RETURN VALUE
480  *
481  * Non-zero if char is illegal.
482  */
483 int IsIllegal(unicode_t ch)
484 {
485 #ifdef MAC
486 	/* Only illegal character on the MAC is the colon. */
487 	if (ch == 0x003A) {
488 		return(1);
489 	} else {
490 		return(0);
491 	}
492 
493 #elif defined UNIX
494 	/* Illegal UNIX characters are NULL and slash. */
495 	if (ch == 0x0000 || ch == 0x002F) {
496 		return(1);
497 	} else {
498 		return(0);
499 	}
500 
501 #elif defined OS2 || defined WIN_95 || defined WIN_NT
502 	/* Illegal char's for OS/2 according to WARP toolkit. */
503 	if (ch < 0x0020 || UnicodeInString("\\/:*?\"<>|", ch)) {
504 		return(1);
505 	} else {
506 		return(0);
507 	}
508 #endif
509 }
510 #endif
511