xref: /freebsd/sys/fs/udf/osta.c (revision 5e3190f700637fcfc1a52daeaa4a031fdd2557c7)
1 /*
2  * Various routines from the OSTA 2.01 specs.  Copyrights are included with
3  * each code segment.  Slight whitespace modifications have been made for
4  * formatting purposes.  Typos/bugs have been fixed.
5  */
6 
7 #include <fs/udf/osta.h>
8 
9 /*****************************************************************************/
10 /*-
11  **********************************************************************
12  * OSTA compliant Unicode compression, uncompression routines.
13  * Copyright 1995 Micro Design International, Inc.
14  * Written by Jason M. Rinn.
15  * Micro Design International gives permission for the free use of the
16  * following source code.
17  */
18 
19 /***********************************************************************
20  * Takes an OSTA CS0 compressed unicode name, and converts
21  * it to Unicode.
22  * The Unicode output will be in the byte order
23  * that the local compiler uses for 16-bit values.
24  * NOTE: This routine only performs error checking on the compID.
25  * It is up to the user to ensure that the unicode buffer is large
26  * enough, and that the compressed unicode name is correct.
27  *
28  * RETURN VALUE
29  *
30  * The number of unicode characters which were uncompressed.
31  * A -1 is returned if the compression ID is invalid.
32  */
33 int
34 udf_UncompressUnicode(
35 	int numberOfBytes,	/* (Input) number of bytes read from media. */
36 	byte *UDFCompressed,	/* (Input) bytes read from media. */
37 	unicode_t *unicode)	/* (Output) uncompressed unicode characters. */
38 {
39 	unsigned int compID;
40 	int returnValue, unicodeIndex, byteIndex;
41 
42 	/* Use UDFCompressed to store current byte being read. */
43 	compID = UDFCompressed[0];
44 
45 	/* First check for valid compID. */
46 	if (compID != 8 && compID != 16) {
47 		returnValue = -1;
48 	} else {
49 		unicodeIndex = 0;
50 		byteIndex = 1;
51 
52 		/* Loop through all the bytes. */
53 		while (byteIndex < numberOfBytes) {
54 			if (compID == 16) {
55 				/* Move the first byte to the high bits of the
56 				 * unicode char.
57 				 */
58 				unicode[unicodeIndex] =
59 				    UDFCompressed[byteIndex++] << 8;
60 			} else {
61 				unicode[unicodeIndex] = 0;
62 			}
63 			if (byteIndex < numberOfBytes) {
64 				/*Then the next byte to the low bits. */
65 				unicode[unicodeIndex] |=
66 				    UDFCompressed[byteIndex++];
67 			}
68 			unicodeIndex++;
69 		}
70 		returnValue = unicodeIndex;
71 	}
72 	return(returnValue);
73 }
74 
75 /*
76  * Almost same as udf_UncompressUnicode(). The difference is that
77  * it keeps byte order of unicode string.
78  */
79 int
80 udf_UncompressUnicodeByte(
81 	int numberOfBytes,	/* (Input) number of bytes read from media. */
82 	byte *UDFCompressed,	/* (Input) bytes read from media. */
83 	byte *unicode)		/* (Output) uncompressed unicode characters. */
84 {
85 	unsigned int compID;
86 	int returnValue, unicodeIndex, byteIndex;
87 
88 	/* Use UDFCompressed to store current byte being read. */
89 	compID = UDFCompressed[0];
90 
91 	/* First check for valid compID. */
92 	if (compID != 8 && compID != 16) {
93 		returnValue = -1;
94 	} else {
95 		unicodeIndex = 0;
96 		byteIndex = 1;
97 
98 		/* Loop through all the bytes. */
99 		while (byteIndex < numberOfBytes) {
100 			if (compID == 16) {
101 				/* Move the first byte to the high bits of the
102 				 * unicode char.
103 				 */
104 				unicode[unicodeIndex++] =
105 				    UDFCompressed[byteIndex++];
106 			} else {
107 				unicode[unicodeIndex++] = 0;
108 			}
109 			if (byteIndex < numberOfBytes) {
110 				/*Then the next byte to the low bits. */
111 				unicode[unicodeIndex++] =
112 				    UDFCompressed[byteIndex++];
113 			}
114 		}
115 		returnValue = unicodeIndex;
116 	}
117 	return(returnValue);
118 }
119 
120 /***********************************************************************
121  * DESCRIPTION:
122  * Takes a string of unicode wide characters and returns an OSTA CS0
123  * compressed unicode string. The unicode MUST be in the byte order of
124  * the compiler in order to obtain correct results. Returns an error
125  * if the compression ID is invalid.
126  *
127  * NOTE: This routine assumes the implementation already knows, by
128  * the local environment, how many bits are appropriate and
129  * therefore does no checking to test if the input characters fit
130  * into that number of bits or not.
131  *
132  * RETURN VALUE
133  *
134  * The total number of bytes in the compressed OSTA CS0 string,
135  * including the compression ID.
136  * A -1 is returned if the compression ID is invalid.
137  */
138 int
139 udf_CompressUnicode(
140 	int numberOfChars,	/* (Input) number of unicode characters. */
141 	int compID,		/* (Input) compression ID to be used. */
142 	unicode_t *unicode,	/* (Input) unicode characters to compress. */
143 	byte *UDFCompressed)	/* (Output) compressed string, as bytes. */
144 {
145 	int byteIndex, unicodeIndex;
146 
147 	if (compID != 8 && compID != 16) {
148 		byteIndex = -1; /* Unsupported compression ID ! */
149 	} else {
150 		/* Place compression code in first byte. */
151 		UDFCompressed[0] = compID;
152 
153 		byteIndex = 1;
154 		unicodeIndex = 0;
155 		while (unicodeIndex < numberOfChars) {
156 			if (compID == 16) {
157 				/* First, place the high bits of the char
158 				 * into the byte stream.
159 				 */
160 				UDFCompressed[byteIndex++] =
161 				    (unicode[unicodeIndex] & 0xFF00) >> 8;
162 			}
163 			/*Then place the low bits into the stream. */
164 			UDFCompressed[byteIndex++] =
165 			    unicode[unicodeIndex] & 0x00FF;
166 			unicodeIndex++;
167 		}
168 	}
169 	return(byteIndex);
170 }
171 
172 /*****************************************************************************/
/*
 * CRC 010041
 *
 * Byte-indexed lookup table for a 16-bit CRC.  The generator polynomial
 * is 010041 in octal, i.e. 0x1021.  Entry i is the CRC register after
 * the byte i has been shifted through a zero register, most significant
 * bit first.  The table is only ever read, so it is kept const.
 */
static const unsigned short crc_table[256] = {
	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
	0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
	0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
	0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
	0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
	0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
	0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
	0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
	0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
	0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
	0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
	0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
	0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
	0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
	0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
	0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
	0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
	0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
	0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
	0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
	0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
	0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
	0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
	0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
	0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
	0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
	0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
	0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
	0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
	0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
	0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
	0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};
210 
211 unsigned short
212 udf_cksum(unsigned char *s, int n)
213 {
214 	unsigned short crc=0;
215 
216 	while (n-- > 0)
217 		crc = crc_table[(crc>>8 ^ *s++) & 0xff] ^ (crc<<8);
218 	return crc;
219 }
220 
221 /* UNICODE Checksum */
222 unsigned short
223 udf_unicode_cksum(unsigned short *s, int n)
224 {
225 	unsigned short crc=0;
226 
227 	while (n-- > 0) {
228 		/* Take high order byte first--corresponds to a big endian
229 		 * byte stream.
230 		 */
231 		crc = crc_table[(crc>>8 ^ (*s>>8)) & 0xff] ^ (crc<<8);
232 		crc = crc_table[(crc>>8 ^ (*s++ & 0xff)) & 0xff] ^ (crc<<8);
233 	}
234 	return crc;
235 }
236 
#ifdef MAIN
/*
 * Stand-alone self-test, only built when MAIN is defined.  It runs
 * udf_cksum() over a three byte sample and prints the computed value
 * next to the expected one (0x3299).
 */
#include <stdio.h>

unsigned char bytes[] = { 0x70, 0x6A, 0x77 };

int
main(void)
{
	unsigned short x;

	x = udf_cksum(bytes, sizeof bytes);
	printf("checksum: calculated=%4.4x, correct=%4.4x\n", x, 0x3299);
	return (0);
}
#endif
248 
249 /*****************************************************************************/
250 #ifdef NEEDS_ISPRINT
251 /*-
252  **********************************************************************
253  * OSTA UDF compliant file name translation routine for OS/2,
254  * Windows 95, Windows NT, Macintosh and UNIX.
255  * Copyright 1995 Micro Design International, Inc.
256  * Written by Jason M. Rinn.
257  * Micro Design International gives permission for the free use of the
258  * following source code.
259  */
260 
261 /***********************************************************************
262  * To use these routines with different operating systems.
263  *
264  * OS/2
265  * Define OS2
266  * Define MAXLEN = 254
267  *
268  * Windows 95
269  * Define WIN_95
270  * Define MAXLEN = 255
271  *
272  * Windows NT
273  * Define WIN_NT
274  * Define MAXLEN = 255
275  *
276  * Macintosh:
277  * Define APPLE_MAC.
278  * Define MAXLEN = 31.
279  *
280  * UNIX
281  * Define UNIX.
282  * Define MAXLEN as specified by unix version.
283  */
284 
/* Character codes and flags used by the file name translation below. */
#define	ILLEGAL_CHAR_MARK	0x5F	/* Replaces runs of illegal chars. */
#define	CRC_MARK		0x23	/* Introduces the appended CRC. */
#define	EXT_SIZE		5	/* Longest extension, in chars. */
#define	TRUE			1
#define	FALSE			0
#define	PERIOD			0x2E
#define	SPACE			0x20
292 
293 /*** PROTOTYPES ***/
294 int IsIllegal(unicode_t ch);
295 
296 /* Define a function or macro which determines if a Unicode character is
297  * printable under your implementation.
298  */
299 int UnicodeIsPrint(unicode_t);
300 
301 /***********************************************************************
302  * Translates a long file name to one using a MAXLEN and an illegal
303  * char set in accord with the OSTA requirements. Assumes the name has
304  * already been translated to Unicode.
305  *
306  * RETURN VALUE
307  *
308  * Number of unicode characters in translated name.
309  */
310 int UDFTransName(
311 	unicode_t *newName,	/* (Output)Translated name. Must be of length
312 				 * MAXLEN */
313 	unicode_t *udfName,	/* (Input) Name from UDF volume.*/
314 	int udfLen)		/* (Input) Length of UDF Name. */
315 {
316 	int index, newIndex = 0, needsCRC = FALSE;
317 	int extIndex = 0, newExtIndex = 0, hasExt = FALSE;
318 #if defined OS2 || defined WIN_95 || defined WIN_NT
319 	int trailIndex = 0;
320 #endif
321 	unsigned short valueCRC;
322 	unicode_t current;
323 	const char hexChar[] = "0123456789ABCDEF";
324 
325 	for (index = 0; index < udfLen; index++) {
326 		current = udfName[index];
327 
328 		if (IsIllegal(current) || !UnicodeIsPrint(current)) {
329 			needsCRC = TRUE;
330 			/* Replace Illegal and non-displayable chars with
331 			 * underscore.
332 			 */
333 			current = ILLEGAL_CHAR_MARK;
334 			/* Skip any other illegal or non-displayable
335 			 * characters.
336 			 */
337 			while(index+1 < udfLen && (IsIllegal(udfName[index+1])
338 			    || !UnicodeIsPrint(udfName[index+1]))) {
339 				index++;
340 			}
341 		}
342 
343 		/* Record position of extension, if one is found. */
344 		if (current == PERIOD && (udfLen - index -1) <= EXT_SIZE) {
345 			if (udfLen == index + 1) {
346 				/* A trailing period is NOT an extension. */
347 				hasExt = FALSE;
348 			} else {
349 				hasExt = TRUE;
350 				extIndex = index;
351 				newExtIndex = newIndex;
352 			}
353 		}
354 
355 #if defined OS2 || defined WIN_95 || defined WIN_NT
356 		/* Record position of last char which is NOT period or space. */
357 		else if (current != PERIOD && current != SPACE) {
358 			trailIndex = newIndex;
359 		}
360 #endif
361 
362 		if (newIndex < MAXLEN) {
363 			newName[newIndex++] = current;
364 		} else {
365 			needsCRC = TRUE;
366 		}
367 	}
368 
369 #if defined OS2 || defined WIN_95 || defined WIN_NT
370 	/* For OS2, 95 & NT, truncate any trailing periods and\or spaces. */
371 	if (trailIndex != newIndex - 1) {
372 		newIndex = trailIndex + 1;
373 		needsCRC = TRUE;
374 		hasExt = FALSE; /* Trailing period does not make an
375 				 * extension. */
376 	}
377 #endif
378 
379 	if (needsCRC) {
380 		unicode_t ext[EXT_SIZE];
381 		int localExtIndex = 0;
382 		if (hasExt) {
383 			int maxFilenameLen;
384 			/* Translate extension, and store it in ext. */
385 			for(index = 0; index<EXT_SIZE &&
386 			    extIndex + index +1 < udfLen; index++ ) {
387 				current = udfName[extIndex + index + 1];
388 				if (IsIllegal(current) ||
389 				    !UnicodeIsPrint(current)) {
390 					needsCRC = 1;
391 					/* Replace Illegal and non-displayable
392 					 * chars with underscore.
393 					 */
394 					current = ILLEGAL_CHAR_MARK;
395 					/* Skip any other illegal or
396 					 * non-displayable characters.
397 					 */
398 					while(index + 1 < EXT_SIZE
399 					    && (IsIllegal(udfName[extIndex +
400 					    index + 2]) ||
401 					    !isprint(udfName[extIndex +
402 					    index + 2]))) {
403 						index++;
404 					}
405 				}
406 				ext[localExtIndex++] = current;
407 			}
408 
409 			/* Truncate filename to leave room for extension and
410 			 * CRC.
411 			 */
412 			maxFilenameLen = ((MAXLEN - 5) - localExtIndex - 1);
413 			if (newIndex > maxFilenameLen) {
414 				newIndex = maxFilenameLen;
415 			} else {
416 				newIndex = newExtIndex;
417 			}
418 		} else if (newIndex > MAXLEN - 5) {
419 			/*If no extension, make sure to leave room for CRC. */
420 			newIndex = MAXLEN - 5;
421 		}
422 		newName[newIndex++] = CRC_MARK; /* Add mark for CRC. */
423 
424 		/*Calculate CRC from original filename from FileIdentifier. */
425 		valueCRC = udf_unicode_cksum(udfName, udfLen);
426 		/* Convert 16-bits of CRC to hex characters. */
427 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
428 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
429 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
430 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
431 
432 		/* Place a translated extension at end, if found. */
433 		if (hasExt) {
434 			newName[newIndex++] = PERIOD;
435 			for (index = 0;index < localExtIndex ;index++ ) {
436 				newName[newIndex++] = ext[index];
437 			}
438 		}
439 	}
440 	return(newIndex);
441 }
442 
443 #if defined OS2 || defined WIN_95 || defined WIN_NT
444 /***********************************************************************
445  * Decides if a Unicode character matches one of a list
446  * of ASCII characters.
447  * Used by OS2 version of IsIllegal for readability, since all of the
448  * illegal characters above 0x0020 are in the ASCII subset of Unicode.
449  * Works very similarly to the standard C function strchr().
450  *
451  * RETURN VALUE
452  *
453  * Non-zero if the Unicode character is in the given ASCII string.
454  */
455 int UnicodeInString(
456 	unsigned char *string,	/* (Input) String to search through. */
457 	unicode_t ch)		/* (Input) Unicode char to search for. */
458 {
459 	int found = FALSE;
460 	while (*string != '\0' && found == FALSE) {
461 		/* These types should compare, since both are unsigned
462 		 * numbers. */
463 		if (*string == ch) {
464 			found = TRUE;
465 		}
466 		string++;
467 	}
468 	return(found);
469 }
470 #endif /* OS2 */
471 
472 /***********************************************************************
473  * Decides whether the given character is illegal for a given OS.
474  *
475  * RETURN VALUE
476  *
477  * Non-zero if char is illegal.
478  */
479 int IsIllegal(unicode_t ch)
480 {
481 #ifdef APPLE_MAC
482 	/* Only illegal character on the MAC is the colon. */
483 	if (ch == 0x003A) {
484 		return(1);
485 	} else {
486 		return(0);
487 	}
488 
489 #elif defined UNIX
490 	/* Illegal UNIX characters are NULL and slash. */
491 	if (ch == 0x0000 || ch == 0x002F) {
492 		return(1);
493 	} else {
494 		return(0);
495 	}
496 
497 #elif defined OS2 || defined WIN_95 || defined WIN_NT
498 	/* Illegal char's for OS/2 according to WARP toolkit. */
499 	if (ch < 0x0020 || UnicodeInString("\\/:*?\"<>|", ch)) {
500 		return(1);
501 	} else {
502 		return(0);
503 	}
504 #endif
505 }
506 #endif
507