1*c9083b85SXin LI 2*c9083b85SXin LI 3*c9083b85SXin LI 4*c9083b85SXin LI 5*c9083b85SXin LI 6*c9083b85SXin LI 7*c9083b85SXin LINetwork Working Group P. Deutsch 8*c9083b85SXin LIRequest for Comments: 1952 Aladdin Enterprises 9*c9083b85SXin LICategory: Informational May 1996 10*c9083b85SXin LI 11*c9083b85SXin LI 12*c9083b85SXin LI GZIP file format specification version 4.3 13*c9083b85SXin LI 14*c9083b85SXin LIStatus of This Memo 15*c9083b85SXin LI 16*c9083b85SXin LI This memo provides information for the Internet community. This memo 17*c9083b85SXin LI does not specify an Internet standard of any kind. Distribution of 18*c9083b85SXin LI this memo is unlimited. 19*c9083b85SXin LI 20*c9083b85SXin LIIESG Note: 21*c9083b85SXin LI 22*c9083b85SXin LI The IESG takes no position on the validity of any Intellectual 23*c9083b85SXin LI Property Rights statements contained in this document. 24*c9083b85SXin LI 25*c9083b85SXin LINotices 26*c9083b85SXin LI 27*c9083b85SXin LI Copyright (c) 1996 L. Peter Deutsch 28*c9083b85SXin LI 29*c9083b85SXin LI Permission is granted to copy and distribute this document for any 30*c9083b85SXin LI purpose and without charge, including translations into other 31*c9083b85SXin LI languages and incorporation into compilations, provided that the 32*c9083b85SXin LI copyright notice and this notice are preserved, and that any 33*c9083b85SXin LI substantive changes or deletions from the original are clearly 34*c9083b85SXin LI marked. 35*c9083b85SXin LI 36*c9083b85SXin LI A pointer to the latest version of this and related documentation in 37*c9083b85SXin LI HTML format can be found at the URL 38*c9083b85SXin LI <ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>. 39*c9083b85SXin LI 40*c9083b85SXin LIAbstract 41*c9083b85SXin LI 42*c9083b85SXin LI This specification defines a lossless compressed data format that is 43*c9083b85SXin LI compatible with the widely used GZIP utility. 
The format includes a 44*c9083b85SXin LI cyclic redundancy check value for detecting data corruption. The 45*c9083b85SXin LI format presently uses the DEFLATE method of compression but can be 46*c9083b85SXin LI easily extended to use other compression methods. The format can be 47*c9083b85SXin LI implemented readily in a manner not covered by patents. 48*c9083b85SXin LI 49*c9083b85SXin LI 50*c9083b85SXin LI 51*c9083b85SXin LI 52*c9083b85SXin LI 53*c9083b85SXin LI 54*c9083b85SXin LI 55*c9083b85SXin LI 56*c9083b85SXin LI 57*c9083b85SXin LI 58*c9083b85SXin LIDeutsch Informational [Page 1] 59*c9083b85SXin LI 60*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 61*c9083b85SXin LI 62*c9083b85SXin LI 63*c9083b85SXin LITable of Contents 64*c9083b85SXin LI 65*c9083b85SXin LI 1. Introduction ................................................... 2 66*c9083b85SXin LI 1.1. Purpose ................................................... 2 67*c9083b85SXin LI 1.2. Intended audience ......................................... 3 68*c9083b85SXin LI 1.3. Scope ..................................................... 3 69*c9083b85SXin LI 1.4. Compliance ................................................ 3 70*c9083b85SXin LI 1.5. Definitions of terms and conventions used ................. 3 71*c9083b85SXin LI 1.6. Changes from previous versions ............................ 3 72*c9083b85SXin LI 2. Detailed specification ......................................... 4 73*c9083b85SXin LI 2.1. Overall conventions ....................................... 4 74*c9083b85SXin LI 2.2. File format ............................................... 5 75*c9083b85SXin LI 2.3. Member format ............................................. 5 76*c9083b85SXin LI 2.3.1. Member header and trailer ........................... 6 77*c9083b85SXin LI 2.3.1.1. Extra field ................................... 8 78*c9083b85SXin LI 2.3.1.2. Compliance .................................... 9 79*c9083b85SXin LI 3. 
References .................................................. 9 80*c9083b85SXin LI 4. Security Considerations .................................... 10 81*c9083b85SXin LI 5. Acknowledgements ........................................... 10 82*c9083b85SXin LI 6. Author's Address ........................................... 10 83*c9083b85SXin LI 7. Appendix: Jean-Loup Gailly's gzip utility .................. 11 84*c9083b85SXin LI 8. Appendix: Sample CRC Code .................................. 11 85*c9083b85SXin LI 86*c9083b85SXin LI1. Introduction 87*c9083b85SXin LI 88*c9083b85SXin LI 1.1. Purpose 89*c9083b85SXin LI 90*c9083b85SXin LI The purpose of this specification is to define a lossless 91*c9083b85SXin LI compressed data format that: 92*c9083b85SXin LI 93*c9083b85SXin LI * Is independent of CPU type, operating system, file system, 94*c9083b85SXin LI and character set, and hence can be used for interchange; 95*c9083b85SXin LI * Can compress or decompress a data stream (as opposed to a 96*c9083b85SXin LI randomly accessible file) to produce another data stream, 97*c9083b85SXin LI using only an a priori bounded amount of intermediate 98*c9083b85SXin LI storage, and hence can be used in data communications or 99*c9083b85SXin LI similar structures such as Unix filters; 100*c9083b85SXin LI * Compresses data with efficiency comparable to the best 101*c9083b85SXin LI currently available general-purpose compression methods, 102*c9083b85SXin LI and in particular considerably better than the "compress" 103*c9083b85SXin LI program; 104*c9083b85SXin LI * Can be implemented readily in a manner not covered by 105*c9083b85SXin LI patents, and hence can be practiced freely; 106*c9083b85SXin LI * Is compatible with the file format produced by the current 107*c9083b85SXin LI widely used gzip utility, in that conforming decompressors 108*c9083b85SXin LI will be able to read data produced by the existing gzip 109*c9083b85SXin LI compressor. 
110*c9083b85SXin LI 111*c9083b85SXin LI 112*c9083b85SXin LI 113*c9083b85SXin LI 114*c9083b85SXin LIDeutsch Informational [Page 2] 115*c9083b85SXin LI 116*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 117*c9083b85SXin LI 118*c9083b85SXin LI 119*c9083b85SXin LI The data format defined by this specification does not attempt to: 120*c9083b85SXin LI 121*c9083b85SXin LI * Provide random access to compressed data; 122*c9083b85SXin LI * Compress specialized data (e.g., raster graphics) as well as 123*c9083b85SXin LI the best currently available specialized algorithms. 124*c9083b85SXin LI 125*c9083b85SXin LI 1.2. Intended audience 126*c9083b85SXin LI 127*c9083b85SXin LI This specification is intended for use by implementors of software 128*c9083b85SXin LI to compress data into gzip format and/or decompress data from gzip 129*c9083b85SXin LI format. 130*c9083b85SXin LI 131*c9083b85SXin LI The text of the specification assumes a basic background in 132*c9083b85SXin LI programming at the level of bits and other primitive data 133*c9083b85SXin LI representations. 134*c9083b85SXin LI 135*c9083b85SXin LI 1.3. Scope 136*c9083b85SXin LI 137*c9083b85SXin LI The specification specifies a compression method and a file format 138*c9083b85SXin LI (the latter assuming only that a file can store a sequence of 139*c9083b85SXin LI arbitrary bytes). It does not specify any particular interface to 140*c9083b85SXin LI a file system or anything about character sets or encodings 141*c9083b85SXin LI (except for file names and comments, which are optional). 142*c9083b85SXin LI 143*c9083b85SXin LI 1.4. Compliance 144*c9083b85SXin LI 145*c9083b85SXin LI Unless otherwise indicated below, a compliant decompressor must be 146*c9083b85SXin LI able to accept and decompress any file that conforms to all the 147*c9083b85SXin LI specifications presented here; a compliant compressor must produce 148*c9083b85SXin LI files that conform to all the specifications presented here. 
The 149*c9083b85SXin LI material in the appendices is not part of the specification per se 150*c9083b85SXin LI and is not relevant to compliance. 151*c9083b85SXin LI 152*c9083b85SXin LI 1.5. Definitions of terms and conventions used 153*c9083b85SXin LI 154*c9083b85SXin LI byte: 8 bits stored or transmitted as a unit (same as an octet). 155*c9083b85SXin LI (For this specification, a byte is exactly 8 bits, even on 156*c9083b85SXin LI machines which store a character on a number of bits different 157*c9083b85SXin LI from 8.) See below for the numbering of bits within a byte. 158*c9083b85SXin LI 159*c9083b85SXin LI 1.6. Changes from previous versions 160*c9083b85SXin LI 161*c9083b85SXin LI There have been no technical changes to the gzip format since 162*c9083b85SXin LI version 4.1 of this specification. In version 4.2, some 163*c9083b85SXin LI terminology was changed, and the sample CRC code was rewritten for 164*c9083b85SXin LI clarity and to eliminate the requirement for the caller to do pre- 165*c9083b85SXin LI and post-conditioning. Version 4.3 is a conversion of the 166*c9083b85SXin LI specification to RFC style. 167*c9083b85SXin LI 168*c9083b85SXin LI 169*c9083b85SXin LI 170*c9083b85SXin LIDeutsch Informational [Page 3] 171*c9083b85SXin LI 172*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 173*c9083b85SXin LI 174*c9083b85SXin LI 175*c9083b85SXin LI2. Detailed specification 176*c9083b85SXin LI 177*c9083b85SXin LI 2.1. Overall conventions 178*c9083b85SXin LI 179*c9083b85SXin LI In the diagrams below, a box like this: 180*c9083b85SXin LI 181*c9083b85SXin LI +---+ 182*c9083b85SXin LI | | <-- the vertical bars might be missing 183*c9083b85SXin LI +---+ 184*c9083b85SXin LI 185*c9083b85SXin LI represents one byte; a box like this: 186*c9083b85SXin LI 187*c9083b85SXin LI +==============+ 188*c9083b85SXin LI | | 189*c9083b85SXin LI +==============+ 190*c9083b85SXin LI 191*c9083b85SXin LI represents a variable number of bytes. 
192*c9083b85SXin LI 193*c9083b85SXin LI Bytes stored within a computer do not have a "bit order", since 194*c9083b85SXin LI they are always treated as a unit. However, a byte considered as 195*c9083b85SXin LI an integer between 0 and 255 does have a most- and least- 196*c9083b85SXin LI significant bit, and since we write numbers with the most- 197*c9083b85SXin LI significant digit on the left, we also write bytes with the most- 198*c9083b85SXin LI significant bit on the left. In the diagrams below, we number the 199*c9083b85SXin LI bits of a byte so that bit 0 is the least-significant bit, i.e., 200*c9083b85SXin LI the bits are numbered: 201*c9083b85SXin LI 202*c9083b85SXin LI +--------+ 203*c9083b85SXin LI |76543210| 204*c9083b85SXin LI +--------+ 205*c9083b85SXin LI 206*c9083b85SXin LI This document does not address the issue of the order in which 207*c9083b85SXin LI bits of a byte are transmitted on a bit-sequential medium, since 208*c9083b85SXin LI the data format described here is byte- rather than bit-oriented. 209*c9083b85SXin LI 210*c9083b85SXin LI Within a computer, a number may occupy multiple bytes. All 211*c9083b85SXin LI multi-byte numbers in the format described here are stored with 212*c9083b85SXin LI the least-significant byte first (at the lower memory address). 213*c9083b85SXin LI For example, the decimal number 520 is stored as: 214*c9083b85SXin LI 215*c9083b85SXin LI 0 1 216*c9083b85SXin LI +--------+--------+ 217*c9083b85SXin LI |00001000|00000010| 218*c9083b85SXin LI +--------+--------+ 219*c9083b85SXin LI ^ ^ 220*c9083b85SXin LI | | 221*c9083b85SXin LI | + more significant byte = 2 x 256 222*c9083b85SXin LI + less significant byte = 8 223*c9083b85SXin LI 224*c9083b85SXin LI 225*c9083b85SXin LI 226*c9083b85SXin LIDeutsch Informational [Page 4] 227*c9083b85SXin LI 228*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 229*c9083b85SXin LI 230*c9083b85SXin LI 231*c9083b85SXin LI 2.2. 
File format 232*c9083b85SXin LI 233*c9083b85SXin LI A gzip file consists of a series of "members" (compressed data 234*c9083b85SXin LI sets). The format of each member is specified in the following 235*c9083b85SXin LI section. The members simply appear one after another in the file, 236*c9083b85SXin LI with no additional information before, between, or after them. 237*c9083b85SXin LI 238*c9083b85SXin LI 2.3. Member format 239*c9083b85SXin LI 240*c9083b85SXin LI Each member has the following structure: 241*c9083b85SXin LI 242*c9083b85SXin LI +---+---+---+---+---+---+---+---+---+---+ 243*c9083b85SXin LI |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) 244*c9083b85SXin LI +---+---+---+---+---+---+---+---+---+---+ 245*c9083b85SXin LI 246*c9083b85SXin LI (if FLG.FEXTRA set) 247*c9083b85SXin LI 248*c9083b85SXin LI +---+---+=================================+ 249*c9083b85SXin LI | XLEN |...XLEN bytes of "extra field"...| (more-->) 250*c9083b85SXin LI +---+---+=================================+ 251*c9083b85SXin LI 252*c9083b85SXin LI (if FLG.FNAME set) 253*c9083b85SXin LI 254*c9083b85SXin LI +=========================================+ 255*c9083b85SXin LI |...original file name, zero-terminated...| (more-->) 256*c9083b85SXin LI +=========================================+ 257*c9083b85SXin LI 258*c9083b85SXin LI (if FLG.FCOMMENT set) 259*c9083b85SXin LI 260*c9083b85SXin LI +===================================+ 261*c9083b85SXin LI |...file comment, zero-terminated...| (more-->) 262*c9083b85SXin LI +===================================+ 263*c9083b85SXin LI 264*c9083b85SXin LI (if FLG.FHCRC set) 265*c9083b85SXin LI 266*c9083b85SXin LI +---+---+ 267*c9083b85SXin LI | CRC16 | 268*c9083b85SXin LI +---+---+ 269*c9083b85SXin LI 270*c9083b85SXin LI +=======================+ 271*c9083b85SXin LI |...compressed blocks...| (more-->) 272*c9083b85SXin LI +=======================+ 273*c9083b85SXin LI 274*c9083b85SXin LI 0 1 2 3 4 5 6 7 275*c9083b85SXin LI +---+---+---+---+---+---+---+---+ 
276*c9083b85SXin LI | CRC32 | ISIZE | 277*c9083b85SXin LI +---+---+---+---+---+---+---+---+ 278*c9083b85SXin LI 279*c9083b85SXin LI 280*c9083b85SXin LI 281*c9083b85SXin LI 282*c9083b85SXin LIDeutsch Informational [Page 5] 283*c9083b85SXin LI 284*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 285*c9083b85SXin LI 286*c9083b85SXin LI 287*c9083b85SXin LI 2.3.1. Member header and trailer 288*c9083b85SXin LI 289*c9083b85SXin LI ID1 (IDentification 1) 290*c9083b85SXin LI ID2 (IDentification 2) 291*c9083b85SXin LI These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139 292*c9083b85SXin LI (0x8b, \213), to identify the file as being in gzip format. 293*c9083b85SXin LI 294*c9083b85SXin LI CM (Compression Method) 295*c9083b85SXin LI This identifies the compression method used in the file. CM 296*c9083b85SXin LI = 0-7 are reserved. CM = 8 denotes the "deflate" 297*c9083b85SXin LI compression method, which is the one customarily used by 298*c9083b85SXin LI gzip and which is documented elsewhere. 299*c9083b85SXin LI 300*c9083b85SXin LI FLG (FLaGs) 301*c9083b85SXin LI This flag byte is divided into individual bits as follows: 302*c9083b85SXin LI 303*c9083b85SXin LI bit 0 FTEXT 304*c9083b85SXin LI bit 1 FHCRC 305*c9083b85SXin LI bit 2 FEXTRA 306*c9083b85SXin LI bit 3 FNAME 307*c9083b85SXin LI bit 4 FCOMMENT 308*c9083b85SXin LI bit 5 reserved 309*c9083b85SXin LI bit 6 reserved 310*c9083b85SXin LI bit 7 reserved 311*c9083b85SXin LI 312*c9083b85SXin LI If FTEXT is set, the file is probably ASCII text. This is 313*c9083b85SXin LI an optional indication, which the compressor may set by 314*c9083b85SXin LI checking a small amount of the input data to see whether any 315*c9083b85SXin LI non-ASCII characters are present. In case of doubt, FTEXT 316*c9083b85SXin LI is cleared, indicating binary data. 
For systems which have 317*c9083b85SXin LI different file formats for ASCII text and binary data, the 318*c9083b85SXin LI decompressor can use FTEXT to choose the appropriate format. 319*c9083b85SXin LI We deliberately do not specify the algorithm used to set 320*c9083b85SXin LI this bit, since a compressor always has the option of 321*c9083b85SXin LI leaving it cleared and a decompressor always has the option 322*c9083b85SXin LI of ignoring it and letting some other program handle issues 323*c9083b85SXin LI of data conversion. 324*c9083b85SXin LI 325*c9083b85SXin LI If FHCRC is set, a CRC16 for the gzip header is present, 326*c9083b85SXin LI immediately before the compressed data. The CRC16 consists 327*c9083b85SXin LI of the two least significant bytes of the CRC32 for all 328*c9083b85SXin LI bytes of the gzip header up to and not including the CRC16. 329*c9083b85SXin LI [The FHCRC bit was never set by versions of gzip up to 330*c9083b85SXin LI 1.2.4, even though it was documented with a different 331*c9083b85SXin LI meaning in gzip 1.2.4.] 332*c9083b85SXin LI 333*c9083b85SXin LI If FEXTRA is set, optional extra fields are present, as 334*c9083b85SXin LI described in a following section. 335*c9083b85SXin LI 336*c9083b85SXin LI 337*c9083b85SXin LI 338*c9083b85SXin LIDeutsch Informational [Page 6] 339*c9083b85SXin LI 340*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 341*c9083b85SXin LI 342*c9083b85SXin LI 343*c9083b85SXin LI If FNAME is set, an original file name is present, 344*c9083b85SXin LI terminated by a zero byte. The name must consist of ISO 345*c9083b85SXin LI 8859-1 (LATIN-1) characters; on operating systems using 346*c9083b85SXin LI EBCDIC or any other character set for file names, the name 347*c9083b85SXin LI must be translated to the ISO LATIN-1 character set.
This 348*c9083b85SXin LI is the original name of the file being compressed, with any 349*c9083b85SXin LI directory components removed, and, if the file being 350*c9083b85SXin LI compressed is on a file system with case insensitive names, 351*c9083b85SXin LI forced to lower case. There is no original file name if the 352*c9083b85SXin LI data was compressed from a source other than a named file; 353*c9083b85SXin LI for example, if the source was stdin on a Unix system, there 354*c9083b85SXin LI is no file name. 355*c9083b85SXin LI 356*c9083b85SXin LI If FCOMMENT is set, a zero-terminated file comment is 357*c9083b85SXin LI present. This comment is not interpreted; it is only 358*c9083b85SXin LI intended for human consumption. The comment must consist of 359*c9083b85SXin LI ISO 8859-1 (LATIN-1) characters. Line breaks should be 360*c9083b85SXin LI denoted by a single line feed character (10 decimal). 361*c9083b85SXin LI 362*c9083b85SXin LI Reserved FLG bits must be zero. 363*c9083b85SXin LI 364*c9083b85SXin LI MTIME (Modification TIME) 365*c9083b85SXin LI This gives the most recent modification time of the original 366*c9083b85SXin LI file being compressed. The time is in Unix format, i.e., 367*c9083b85SXin LI seconds since 00:00:00 GMT, Jan. 1, 1970. (Note that this 368*c9083b85SXin LI may cause problems for MS-DOS and other systems that use 369*c9083b85SXin LI local rather than Universal time.) If the compressed data 370*c9083b85SXin LI did not come from a file, MTIME is set to the time at which 371*c9083b85SXin LI compression started. MTIME = 0 means no time stamp is 372*c9083b85SXin LI available. 373*c9083b85SXin LI 374*c9083b85SXin LI XFL (eXtra FLags) 375*c9083b85SXin LI These flags are available for use by specific compression 376*c9083b85SXin LI methods. 
The "deflate" method (CM = 8) sets these flags as 377*c9083b85SXin LI follows: 378*c9083b85SXin LI 379*c9083b85SXin LI XFL = 2 - compressor used maximum compression, 380*c9083b85SXin LI slowest algorithm 381*c9083b85SXin LI XFL = 4 - compressor used fastest algorithm 382*c9083b85SXin LI 383*c9083b85SXin LI OS (Operating System) 384*c9083b85SXin LI This identifies the type of file system on which compression 385*c9083b85SXin LI took place. This may be useful in determining the end-of-line 386*c9083b85SXin LI convention for text files. The currently defined values are 387*c9083b85SXin LI as follows: 388*c9083b85SXin LI 389*c9083b85SXin LI 390*c9083b85SXin LI 391*c9083b85SXin LI 392*c9083b85SXin LI 393*c9083b85SXin LI 394*c9083b85SXin LIDeutsch Informational [Page 7] 395*c9083b85SXin LI 396*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 397*c9083b85SXin LI 398*c9083b85SXin LI 399*c9083b85SXin LI 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32) 400*c9083b85SXin LI 1 - Amiga 401*c9083b85SXin LI 2 - VMS (or OpenVMS) 402*c9083b85SXin LI 3 - Unix 403*c9083b85SXin LI 4 - VM/CMS 404*c9083b85SXin LI 5 - Atari TOS 405*c9083b85SXin LI 6 - HPFS filesystem (OS/2, NT) 406*c9083b85SXin LI 7 - Macintosh 407*c9083b85SXin LI 8 - Z-System 408*c9083b85SXin LI 9 - CP/M 409*c9083b85SXin LI 10 - TOPS-20 410*c9083b85SXin LI 11 - NTFS filesystem (NT) 411*c9083b85SXin LI 12 - QDOS 412*c9083b85SXin LI 13 - Acorn RISCOS 413*c9083b85SXin LI 255 - unknown 414*c9083b85SXin LI 415*c9083b85SXin LI XLEN (eXtra LENgth) 416*c9083b85SXin LI If FLG.FEXTRA is set, this gives the length of the optional 417*c9083b85SXin LI extra field. See below for details. 418*c9083b85SXin LI 419*c9083b85SXin LI CRC32 (CRC-32) 420*c9083b85SXin LI This contains a Cyclic Redundancy Check value of the 421*c9083b85SXin LI uncompressed data computed according to the CRC-32 algorithm 422*c9083b85SXin LI used in the ISO 3309 standard and in section 8.1.1.6.2 of 423*c9083b85SXin LI ITU-T recommendation V.42.
(See http://www.iso.ch for 424*c9083b85SXin LI ordering ISO documents. See gopher://info.itu.ch for an 425*c9083b85SXin LI online version of ITU-T V.42.) 426*c9083b85SXin LI 427*c9083b85SXin LI ISIZE (Input SIZE) 428*c9083b85SXin LI This contains the size of the original (uncompressed) input 429*c9083b85SXin LI data modulo 2^32. 430*c9083b85SXin LI 431*c9083b85SXin LI 2.3.1.1. Extra field 432*c9083b85SXin LI 433*c9083b85SXin LI If the FLG.FEXTRA bit is set, an "extra field" is present in 434*c9083b85SXin LI the header, with total length XLEN bytes. It consists of a 435*c9083b85SXin LI series of subfields, each of the form: 436*c9083b85SXin LI 437*c9083b85SXin LI +---+---+---+---+==================================+ 438*c9083b85SXin LI |SI1|SI2| LEN |... LEN bytes of subfield data ...| 439*c9083b85SXin LI +---+---+---+---+==================================+ 440*c9083b85SXin LI 441*c9083b85SXin LI SI1 and SI2 provide a subfield ID, typically two ASCII letters 442*c9083b85SXin LI with some mnemonic value. Jean-Loup Gailly 443*c9083b85SXin LI <gzip@prep.ai.mit.edu> is maintaining a registry of subfield 444*c9083b85SXin LI IDs; please send him any subfield ID you wish to use. Subfield 445*c9083b85SXin LI IDs with SI2 = 0 are reserved for future use. The following 446*c9083b85SXin LI IDs are currently defined: 447*c9083b85SXin LI 448*c9083b85SXin LI 449*c9083b85SXin LI 450*c9083b85SXin LIDeutsch Informational [Page 8] 451*c9083b85SXin LI 452*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 453*c9083b85SXin LI 454*c9083b85SXin LI 455*c9083b85SXin LI SI1 SI2 Data 456*c9083b85SXin LI ---------- ---------- ---- 457*c9083b85SXin LI 0x41 ('A') 0x70 ('P') Apollo file type information 458*c9083b85SXin LI 459*c9083b85SXin LI LEN gives the length of the subfield data, excluding the 4 460*c9083b85SXin LI initial bytes. 461*c9083b85SXin LI 462*c9083b85SXin LI 2.3.1.2. 
Compliance 463*c9083b85SXin LI 464*c9083b85SXin LI A compliant compressor must produce files with correct ID1, 465*c9083b85SXin LI ID2, CM, CRC32, and ISIZE, but may set all the other fields in 466*c9083b85SXin LI the fixed-length part of the header to default values (255 for 467*c9083b85SXin LI OS, 0 for all others). The compressor must set all reserved 468*c9083b85SXin LI bits to zero. 469*c9083b85SXin LI 470*c9083b85SXin LI A compliant decompressor must check ID1, ID2, and CM, and 471*c9083b85SXin LI provide an error indication if any of these have incorrect 472*c9083b85SXin LI values. It must examine FEXTRA/XLEN, FNAME, FCOMMENT and FHCRC 473*c9083b85SXin LI at least so it can skip over the optional fields if they are 474*c9083b85SXin LI present. It need not examine any other part of the header or 475*c9083b85SXin LI trailer; in particular, a decompressor may ignore FTEXT and OS 476*c9083b85SXin LI and always produce binary output, and still be compliant. A 477*c9083b85SXin LI compliant decompressor must give an error indication if any 478*c9083b85SXin LI reserved bit is non-zero, since such a bit could indicate the 479*c9083b85SXin LI presence of a new field that would cause subsequent data to be 480*c9083b85SXin LI interpreted incorrectly. 481*c9083b85SXin LI 482*c9083b85SXin LI3. References 483*c9083b85SXin LI 484*c9083b85SXin LI [1] "Information Processing - 8-bit single-byte coded graphic 485*c9083b85SXin LI character sets - Part 1: Latin alphabet No.1" (ISO 8859-1:1987). 486*c9083b85SXin LI The ISO 8859-1 (Latin-1) character set is a superset of 7-bit 487*c9083b85SXin LI ASCII. 
Files defining this character set are available as 488*c9083b85SXin LI iso_8859-1.* in ftp://ftp.uu.net/graphics/png/documents/ 489*c9083b85SXin LI 490*c9083b85SXin LI [2] ISO 3309 491*c9083b85SXin LI 492*c9083b85SXin LI [3] ITU-T recommendation V.42 493*c9083b85SXin LI 494*c9083b85SXin LI [4] Deutsch, L.P.,"DEFLATE Compressed Data Format Specification", 495*c9083b85SXin LI available in ftp://ftp.uu.net/pub/archiving/zip/doc/ 496*c9083b85SXin LI 497*c9083b85SXin LI [5] Gailly, J.-L., GZIP documentation, available as gzip-*.tar in 498*c9083b85SXin LI ftp://prep.ai.mit.edu/pub/gnu/ 499*c9083b85SXin LI 500*c9083b85SXin LI [6] Sarwate, D.V., "Computation of Cyclic Redundancy Checks via Table 501*c9083b85SXin LI Look-Up", Communications of the ACM, 31(8), pp.1008-1013. 502*c9083b85SXin LI 503*c9083b85SXin LI 504*c9083b85SXin LI 505*c9083b85SXin LI 506*c9083b85SXin LIDeutsch Informational [Page 9] 507*c9083b85SXin LI 508*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 509*c9083b85SXin LI 510*c9083b85SXin LI 511*c9083b85SXin LI [7] Schwaderer, W.D., "CRC Calculation", April 85 PC Tech Journal, 512*c9083b85SXin LI pp.118-133. 513*c9083b85SXin LI 514*c9083b85SXin LI [8] ftp://ftp.adelaide.edu.au/pub/rocksoft/papers/crc_v3.txt, 515*c9083b85SXin LI describing the CRC concept. 516*c9083b85SXin LI 517*c9083b85SXin LI4. Security Considerations 518*c9083b85SXin LI 519*c9083b85SXin LI Any data compression method involves the reduction of redundancy in 520*c9083b85SXin LI the data. Consequently, any corruption of the data is likely to have 521*c9083b85SXin LI severe effects and be difficult to correct. Uncompressed text, on 522*c9083b85SXin LI the other hand, will probably still be readable despite the presence 523*c9083b85SXin LI of some corrupted bytes. 
524*c9083b85SXin LI 525*c9083b85SXin LI It is recommended that systems using this data format provide some 526*c9083b85SXin LI means of validating the integrity of the compressed data, such as by 527*c9083b85SXin LI setting and checking the CRC-32 check value. 528*c9083b85SXin LI 529*c9083b85SXin LI5. Acknowledgements 530*c9083b85SXin LI 531*c9083b85SXin LI Trademarks cited in this document are the property of their 532*c9083b85SXin LI respective owners. 533*c9083b85SXin LI 534*c9083b85SXin LI Jean-Loup Gailly designed the gzip format and wrote, with Mark Adler, 535*c9083b85SXin LI the related software described in this specification. Glenn 536*c9083b85SXin LI Randers-Pehrson converted this document to RFC and HTML format. 537*c9083b85SXin LI 538*c9083b85SXin LI6. Author's Address 539*c9083b85SXin LI 540*c9083b85SXin LI L. Peter Deutsch 541*c9083b85SXin LI Aladdin Enterprises 542*c9083b85SXin LI 203 Santa Margarita Ave. 543*c9083b85SXin LI Menlo Park, CA 94025 544*c9083b85SXin LI 545*c9083b85SXin LI Phone: (415) 322-0103 (AM only) 546*c9083b85SXin LI FAX: (415) 322-1734 547*c9083b85SXin LI EMail: <ghost@aladdin.com> 548*c9083b85SXin LI 549*c9083b85SXin LI Questions about the technical content of this specification can be 550*c9083b85SXin LI sent by email to: 551*c9083b85SXin LI 552*c9083b85SXin LI Jean-Loup Gailly <gzip@prep.ai.mit.edu> and 553*c9083b85SXin LI Mark Adler <madler@alumni.caltech.edu> 554*c9083b85SXin LI 555*c9083b85SXin LI Editorial comments on this specification can be sent by email to: 556*c9083b85SXin LI 557*c9083b85SXin LI L. Peter Deutsch <ghost@aladdin.com> and 558*c9083b85SXin LI Glenn Randers-Pehrson <randeg@alumni.rpi.edu> 559*c9083b85SXin LI 560*c9083b85SXin LI 561*c9083b85SXin LI 562*c9083b85SXin LIDeutsch Informational [Page 10] 563*c9083b85SXin LI 564*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 565*c9083b85SXin LI 566*c9083b85SXin LI 567*c9083b85SXin LI7. 
Appendix: Jean-Loup Gailly's gzip utility 568*c9083b85SXin LI 569*c9083b85SXin LI The most widely used implementation of gzip compression, and the 570*c9083b85SXin LI original documentation on which this specification is based, were 571*c9083b85SXin LI created by Jean-Loup Gailly <gzip@prep.ai.mit.edu>. Since this 572*c9083b85SXin LI implementation is a de facto standard, we mention some more of its 573*c9083b85SXin LI features here. Again, the material in this section is not part of 574*c9083b85SXin LI the specification per se, and implementations need not follow it to 575*c9083b85SXin LI be compliant. 576*c9083b85SXin LI 577*c9083b85SXin LI When compressing or decompressing a file, gzip preserves the 578*c9083b85SXin LI protection, ownership, and modification time attributes on the local 579*c9083b85SXin LI file system, since there is no provision for representing protection 580*c9083b85SXin LI attributes in the gzip file format itself. Since the file format 581*c9083b85SXin LI includes a modification time, the gzip decompressor provides a 582*c9083b85SXin LI command line switch that assigns the modification time from the file, 583*c9083b85SXin LI rather than the local modification time of the compressed input, to 584*c9083b85SXin LI the decompressed output. 585*c9083b85SXin LI 586*c9083b85SXin LI8. Appendix: Sample CRC Code 587*c9083b85SXin LI 588*c9083b85SXin LI The following sample code represents a practical implementation of 589*c9083b85SXin LI the CRC (Cyclic Redundancy Check). (See also ISO 3309 and ITU-T V.42 590*c9083b85SXin LI for a formal specification.) 591*c9083b85SXin LI 592*c9083b85SXin LI The sample code is in the ANSI C programming language. Non C users 593*c9083b85SXin LI may find it easier to read with these hints: 594*c9083b85SXin LI 595*c9083b85SXin LI & Bitwise AND operator. 596*c9083b85SXin LI ^ Bitwise exclusive-OR operator. 597*c9083b85SXin LI >> Bitwise right shift operator. 
When applied to an 598*c9083b85SXin LI unsigned quantity, as here, right shift inserts zero 599*c9083b85SXin LI bit(s) at the left. 600*c9083b85SXin LI ! Logical NOT operator. 601*c9083b85SXin LI ++ "n++" increments the variable n. 602*c9083b85SXin LI 0xNNN 0x introduces a hexadecimal (base 16) constant. 603*c9083b85SXin LI Suffix L indicates a long value (at least 32 bits). 604*c9083b85SXin LI 605*c9083b85SXin LI /* Table of CRCs of all 8-bit messages. */ 606*c9083b85SXin LI unsigned long crc_table[256]; 607*c9083b85SXin LI 608*c9083b85SXin LI /* Flag: has the table been computed? Initially false. */ 609*c9083b85SXin LI int crc_table_computed = 0; 610*c9083b85SXin LI 611*c9083b85SXin LI /* Make the table for a fast CRC. */ 612*c9083b85SXin LI void make_crc_table(void) 613*c9083b85SXin LI { 614*c9083b85SXin LI unsigned long c; 615*c9083b85SXin LI 616*c9083b85SXin LI 617*c9083b85SXin LI 618*c9083b85SXin LIDeutsch Informational [Page 11] 619*c9083b85SXin LI 620*c9083b85SXin LIRFC 1952 GZIP File Format Specification May 1996 621*c9083b85SXin LI 622*c9083b85SXin LI 623*c9083b85SXin LI int n, k; 624*c9083b85SXin LI for (n = 0; n < 256; n++) { 625*c9083b85SXin LI c = (unsigned long) n; 626*c9083b85SXin LI for (k = 0; k < 8; k++) { 627*c9083b85SXin LI if (c & 1) { 628*c9083b85SXin LI c = 0xedb88320L ^ (c >> 1); 629*c9083b85SXin LI } else { 630*c9083b85SXin LI c = c >> 1; 631*c9083b85SXin LI } 632*c9083b85SXin LI } 633*c9083b85SXin LI crc_table[n] = c; 634*c9083b85SXin LI } 635*c9083b85SXin LI crc_table_computed = 1; 636*c9083b85SXin LI } 637*c9083b85SXin LI 638*c9083b85SXin LI /* 639*c9083b85SXin LI Update a running crc with the bytes buf[0..len-1] and return 640*c9083b85SXin LI the updated crc. The crc should be initialized to zero. Pre- and 641*c9083b85SXin LI post-conditioning (one's complement) is performed within this 642*c9083b85SXin LI function so it shouldn't be done by the caller. 
Usage example: 643*c9083b85SXin LI 644*c9083b85SXin LI unsigned long crc = 0L; 645*c9083b85SXin LI 646*c9083b85SXin LI while (read_buffer(buffer, length) != EOF) { 647*c9083b85SXin LI crc = update_crc(crc, buffer, length); 648*c9083b85SXin LI } 649*c9083b85SXin LI if (crc != original_crc) error(); 650*c9083b85SXin LI */ 651*c9083b85SXin LI unsigned long update_crc(unsigned long crc, 652*c9083b85SXin LI unsigned char *buf, int len) 653*c9083b85SXin LI { 654*c9083b85SXin LI unsigned long c = crc ^ 0xffffffffL; 655*c9083b85SXin LI int n; 656*c9083b85SXin LI 657*c9083b85SXin LI if (!crc_table_computed) 658*c9083b85SXin LI make_crc_table(); 659*c9083b85SXin LI for (n = 0; n < len; n++) { 660*c9083b85SXin LI c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8); 661*c9083b85SXin LI } 662*c9083b85SXin LI return c ^ 0xffffffffL; 663*c9083b85SXin LI } 664*c9083b85SXin LI 665*c9083b85SXin LI /* Return the CRC of the bytes buf[0..len-1]. */ 666*c9083b85SXin LI unsigned long crc(unsigned char *buf, int len) 667*c9083b85SXin LI { 668*c9083b85SXin LI return update_crc(0L, buf, len); 669*c9083b85SXin LI } 670*c9083b85SXin LI 671*c9083b85SXin LI 672*c9083b85SXin LI 673*c9083b85SXin LI 674*c9083b85SXin LIDeutsch Informational [Page 12] 675*c9083b85SXin LI 676