xref: /freebsd/sys/fs/udf/osta.c (revision 6780ab54325a71e7e70112b11657973edde8655e)
1 /*
2  * Various routines from the OSTA 2.01 specs.  Copyrights are included with
3  * each code segment.  Slight whitespace modifications have been made for
4  * formatting purposes.  Typos/bugs have been fixed.
5  *
6  * $FreeBSD$
7  */
8 
9 #include <fs/udf/osta.h>
10 
11 /*****************************************************************************/
12 /***********************************************************************
13  * OSTA compliant Unicode compression, uncompression routines.
14  * Copyright 1995 Micro Design International, Inc.
15  * Written by Jason M. Rinn.
16  * Micro Design International gives permission for the free use of the
17  * following source code.
18  */
19 
20 /***********************************************************************
21  * Takes an OSTA CS0 compressed unicode name, and converts
22  * it to Unicode.
23  * The Unicode output will be in the byte order
24  * that the local compiler uses for 16-bit values.
25  * NOTE: This routine only performs error checking on the compID.
26  * It is up to the user to ensure that the unicode buffer is large
27  * enough, and that the compressed unicode name is correct.
28  *
29  * RETURN VALUE
30  *
31  * The number of unicode characters which were uncompressed.
32  * A -1 is returned if the compression ID is invalid.
33  */
34 int
35 udf_UncompressUnicode(
36 	int numberOfBytes,	/* (Input) number of bytes read from media. */
37 	byte *UDFCompressed,	/* (Input) bytes read from media. */
38 	unicode_t *unicode)	/* (Output) uncompressed unicode characters. */
39 {
40 	unsigned int compID;
41 	int returnValue, unicodeIndex, byteIndex;
42 
43 	/* Use UDFCompressed to store current byte being read. */
44 	compID = UDFCompressed[0];
45 
46 	/* First check for valid compID. */
47 	if (compID != 8 && compID != 16) {
48 		returnValue = -1;
49 	} else {
50 		unicodeIndex = 0;
51 		byteIndex = 1;
52 
53 		/* Loop through all the bytes. */
54 		while (byteIndex < numberOfBytes) {
55 			if (compID == 16) {
56 				/* Move the first byte to the high bits of the
57 				 * unicode char.
58 				 */
59 				unicode[unicodeIndex] =
60 				    UDFCompressed[byteIndex++] << 8;
61 			} else {
62 				unicode[unicodeIndex] = 0;
63 			}
64 			if (byteIndex < numberOfBytes) {
65 				/*Then the next byte to the low bits. */
66 				unicode[unicodeIndex] |=
67 				    UDFCompressed[byteIndex++];
68 			}
69 			unicodeIndex++;
70 		}
71 		returnValue = unicodeIndex;
72 	}
73 	return(returnValue);
74 }
75 
76 /***********************************************************************
77  * DESCRIPTION:
78  * Takes a string of unicode wide characters and returns an OSTA CS0
79  * compressed unicode string. The unicode MUST be in the byte order of
80  * the compiler in order to obtain correct results. Returns an error
81  * if the compression ID is invalid.
82  *
83  * NOTE: This routine assumes the implementation already knows, by
84  * the local environment, how many bits are appropriate and
85  * therefore does no checking to test if the input characters fit
86  * into that number of bits or not.
87  *
88  * RETURN VALUE
89  *
90  * The total number of bytes in the compressed OSTA CS0 string,
91  * including the compression ID.
92  * A -1 is returned if the compression ID is invalid.
93  */
94 int
95 udf_CompressUnicode(
96 	int numberOfChars,	/* (Input) number of unicode characters. */
97 	int compID,		/* (Input) compression ID to be used. */
98 	unicode_t *unicode,	/* (Input) unicode characters to compress. */
99 	byte *UDFCompressed)	/* (Output) compressed string, as bytes. */
100 {
101 	int byteIndex, unicodeIndex;
102 
103 	if (compID != 8 && compID != 16) {
104 		byteIndex = -1; /* Unsupported compression ID ! */
105 	} else {
106 		/* Place compression code in first byte. */
107 		UDFCompressed[0] = compID;
108 
109 		byteIndex = 1;
110 		unicodeIndex = 0;
111 		while (unicodeIndex < numberOfChars) {
112 			if (compID == 16) {
113 				/* First, place the high bits of the char
114 				 * into the byte stream.
115 				 */
116 				UDFCompressed[byteIndex++] =
117 				    (unicode[unicodeIndex] & 0xFF00) >> 8;
118 			}
119 			/*Then place the low bits into the stream. */
120 			UDFCompressed[byteIndex++] =
121 			    unicode[unicodeIndex] & 0x00FF;
122 			unicodeIndex++;
123 		}
124 	}
125 	return(byteIndex);
126 }
127 
128 /*****************************************************************************/
129 /*
130  * CRC 010041
131  */
/*
 * Byte-at-a-time lookup table for the 16-bit CRC used by the checksum
 * routines below.  Entry i is the CRC register after feeding the single
 * byte i, most significant bit first, through the generator polynomial
 * 0x1021 (octal 010041).  The table is read-only, hence const.
 */
static const unsigned short crc_table[256] = {
	0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
	0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
	0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
	0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
	0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
	0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
	0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
	0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
	0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
	0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
	0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
	0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
	0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
	0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
	0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
	0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
	0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
	0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
	0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
	0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
	0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
	0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
	0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
	0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
	0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
	0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
	0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
	0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
	0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
	0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
	0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
	0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
};
166 
167 unsigned short
168 udf_cksum(s, n)
169 	unsigned char *s;
170 	int n;
171 {
172 	unsigned short crc=0;
173 
174 	while (n-- > 0)
175 		crc = crc_table[(crc>>8 ^ *s++) & 0xff] ^ (crc<<8);
176 	return crc;
177 }
178 
179 /* UNICODE Checksum */
180 unsigned short
181 udf_unicode_cksum(s, n)
182 	unsigned short *s;
183 	int n;
184 {
185 	unsigned short crc=0;
186 
187 	while (n-- > 0) {
188 		/* Take high order byte first--corresponds to a big endian
189 		 * byte stream.
190 		 */
191 		crc = crc_table[(crc>>8 ^ (*s>>8)) & 0xff] ^ (crc<<8);
192 		crc = crc_table[(crc>>8 ^ (*s++ & 0xff)) & 0xff] ^ (crc<<8);
193 	}
194 	return crc;
195 }
196 
#ifdef MAIN
#include <stdio.h>

/* Sample input; the expected checksum printed below is 0x3299. */
unsigned char bytes[] = { 0x70, 0x6A, 0x77 };

/*
 * Stand-alone self-test, built only when MAIN is defined: prints the
 * checksum udf_cksum() computes for the sample bytes next to the
 * expected value.
 */
int
main(void)
{
	unsigned short x;

	x = udf_cksum(bytes, sizeof bytes);
	printf("checksum: calculated=%4.4x, correct=%4.4x\n", x, 0x3299);
	return (0);
}
#endif
208 
209 /*****************************************************************************/
210 #ifdef NEEDS_ISPRINT
211 /***********************************************************************
212  * OSTA UDF compliant file name translation routine for OS/2,
213  * Windows 95, Windows NT, Macintosh and UNIX.
214  * Copyright 1995 Micro Design International, Inc.
215  * Written by Jason M. Rinn.
216  * Micro Design International gives permission for the free use of the
217  * following source code.
218  */
219 
220 /***********************************************************************
221  * To use these routines with different operating systems.
222  *
223  * OS/2
224  * Define OS2
225  * Define MAXLEN = 254
226  *
227  * Windows 95
228  * Define WIN_95
229  * Define MAXLEN = 255
230  *
231  * Windows NT
232  * Define WIN_NT
233  * Define MAXLEN = 255
234  *
235  * Macintosh:
236  * Define MAC.
237  * Define MAXLEN = 31.
238  *
239  * UNIX
240  * Define UNIX.
241  * Define MAXLEN as specified by unix version.
242  */
243 
/* Constants used by the file name translation routine below. */
#define	ILLEGAL_CHAR_MARK	0x005F	/* '_': replaces illegal/non-printable runs. */
#define	CRC_MARK	0x0023	/* '#': introduces the 4-digit hex CRC. */
#define	EXT_SIZE	5	/* Longest suffix treated as an extension. */
#define	TRUE	1
#define	FALSE	0
#define	PERIOD	0x002E	/* '.' */
#define	SPACE	0x0020	/* ' ' */

/*** PROTOTYPES ***/
/* Non-zero if ch may not appear in a file name on the selected OS. */
int IsIllegal(unicode_t ch);

/* Define a function or macro which determines if a Unicode character is
 * printable under your implementation.
 */
int UnicodeIsPrint(unicode_t);
259 
260 /***********************************************************************
261  * Translates a long file name to one using a MAXLEN and an illegal
262  * char set in accord with the OSTA requirements. Assumes the name has
263  * already been translated to Unicode.
264  *
265  * RETURN VALUE
266  *
267  * Number of unicode characters in translated name.
268  */
269 int UDFTransName(
270 	unicode_t *newName,	/* (Output)Translated name. Must be of length
271 				 * MAXLEN */
272 	unicode_t *udfName,	/* (Input) Name from UDF volume.*/
273 	int udfLen)		/* (Input) Length of UDF Name. */
274 {
275 	int index, newIndex = 0, needsCRC = FALSE;
276 	int extIndex = 0, newExtIndex = 0, hasExt = FALSE;
277 #if defined OS2 || defined WIN_95 || defined WIN_NT
278 	int trailIndex = 0;
279 #endif
280 	unsigned short valueCRC;
281 	unicode_t current;
282 	const char hexChar[] = "0123456789ABCDEF";
283 
284 	for (index = 0; index < udfLen; index++) {
285 		current = udfName[index];
286 
287 		if (IsIllegal(current) || !UnicodeIsPrint(current)) {
288 			needsCRC = TRUE;
289 			/* Replace Illegal and non-displayable chars with
290 			 * underscore.
291 			 */
292 			current = ILLEGAL_CHAR_MARK;
293 			/* Skip any other illegal or non-displayable
294 			 * characters.
295 			 */
296 			while(index+1 < udfLen && (IsIllegal(udfName[index+1])
297 			    || !UnicodeIsPrint(udfName[index+1]))) {
298 				index++;
299 			}
300 		}
301 
302 		/* Record position of extension, if one is found. */
303 		if (current == PERIOD && (udfLen - index -1) <= EXT_SIZE) {
304 			if (udfLen == index + 1) {
305 				/* A trailing period is NOT an extension. */
306 				hasExt = FALSE;
307 			} else {
308 				hasExt = TRUE;
309 				extIndex = index;
310 				newExtIndex = newIndex;
311 			}
312 		}
313 
314 #if defined OS2 || defined WIN_95 || defined WIN_NT
315 		/* Record position of last char which is NOT period or space. */
316 		else if (current != PERIOD && current != SPACE) {
317 			trailIndex = newIndex;
318 		}
319 #endif
320 
321 		if (newIndex < MAXLEN) {
322 			newName[newIndex++] = current;
323 		} else {
324 			needsCRC = TRUE;
325 		}
326 	}
327 
328 #if defined OS2 || defined WIN_95 || defined WIN_NT
329 	/* For OS2, 95 & NT, truncate any trailing periods and\or spaces. */
330 	if (trailIndex != newIndex - 1) {
331 		newIndex = trailIndex + 1;
332 		needsCRC = TRUE;
333 		hasExt = FALSE; /* Trailing period does not make an
334 				 * extension. */
335 	}
336 #endif
337 
338 	if (needsCRC) {
339 		unicode_t ext[EXT_SIZE];
340 		int localExtIndex = 0;
341 		if (hasExt) {
342 			int maxFilenameLen;
343 			/* Translate extension, and store it in ext. */
344 			for(index = 0; index<EXT_SIZE &&
345 			    extIndex + index +1 < udfLen; index++ ) {
346 				current = udfName[extIndex + index + 1];
347 				if (IsIllegal(current) ||
348 				    !UnicodeIsPrint(current)) {
349 					needsCRC = 1;
350 					/* Replace Illegal and non-displayable
351 					 * chars with underscore.
352 					 */
353 					current = ILLEGAL_CHAR_MARK;
354 					/* Skip any other illegal or
355 					 * non-displayable characters.
356 					 */
357 					while(index + 1 < EXT_SIZE
358 					    && (IsIllegal(udfName[extIndex +
359 					    index + 2]) ||
360 					    !isprint(udfName[extIndex +
361 					    index + 2]))) {
362 						index++;
363 					}
364 				}
365 				ext[localExtIndex++] = current;
366 			}
367 
368 			/* Truncate filename to leave room for extension and
369 			 * CRC.
370 			 */
371 			maxFilenameLen = ((MAXLEN - 5) - localExtIndex - 1);
372 			if (newIndex > maxFilenameLen) {
373 				newIndex = maxFilenameLen;
374 			} else {
375 				newIndex = newExtIndex;
376 			}
377 		} else if (newIndex > MAXLEN - 5) {
378 			/*If no extension, make sure to leave room for CRC. */
379 			newIndex = MAXLEN - 5;
380 		}
381 		newName[newIndex++] = CRC_MARK; /* Add mark for CRC. */
382 
383 		/*Calculate CRC from original filename from FileIdentifier. */
384 		valueCRC = udf_unicode_cksum(udfName, udfLen);
385 		/* Convert 16-bits of CRC to hex characters. */
386 		newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
387 		newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
388 		newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
389 		newName[newIndex++] = hexChar[(valueCRC & 0x000f)];
390 
391 		/* Place a translated extension at end, if found. */
392 		if (hasExt) {
393 			newName[newIndex++] = PERIOD;
394 			for (index = 0;index < localExtIndex ;index++ ) {
395 				newName[newIndex++] = ext[index];
396 			}
397 		}
398 	}
399 	return(newIndex);
400 }
401 
402 #if defined OS2 || defined WIN_95 || defined WIN_NT
403 /***********************************************************************
404  * Decides if a Unicode character matches one of a list
405  * of ASCII characters.
406  * Used by OS2 version of IsIllegal for readability, since all of the
407  * illegal characters above 0x0020 are in the ASCII subset of Unicode.
408  * Works very similarly to the standard C function strchr().
409  *
410  * RETURN VALUE
411  *
412  * Non-zero if the Unicode character is in the given ASCII string.
413  */
414 int UnicodeInString(
415 	unsigned char *string,	/* (Input) String to search through. */
416 	unicode_t ch)		/* (Input) Unicode char to search for. */
417 {
418 	int found = FALSE;
419 	while (*string != '\0' && found == FALSE) {
420 		/* These types should compare, since both are unsigned
421 		 * numbers. */
422 		if (*string == ch) {
423 			found = TRUE;
424 		}
425 		string++;
426 	}
427 	return(found);
428 }
429 #endif /* OS2 */
430 
431 /***********************************************************************
432  * Decides whether the given character is illegal for a given OS.
433  *
434  * RETURN VALUE
435  *
436  * Non-zero if char is illegal.
437  */
438 int IsIllegal(unicode_t ch)
439 {
440 #ifdef MAC
441 	/* Only illegal character on the MAC is the colon. */
442 	if (ch == 0x003A) {
443 		return(1);
444 	} else {
445 		return(0);
446 	}
447 
448 #elif defined UNIX
449 	/* Illegal UNIX characters are NULL and slash. */
450 	if (ch == 0x0000 || ch == 0x002F) {
451 		return(1);
452 	} else {
453 		return(0);
454 	}
455 
456 #elif defined OS2 || defined WIN_95 || defined WIN_NT
457 	/* Illegal char's for OS/2 according to WARP toolkit. */
458 	if (ch < 0x0020 || UnicodeInString("\\/:*?\"<>|", ch)) {
459 		return(1);
460 	} else {
461 		return(0);
462 	}
463 #endif
464 }
465 #endif
466