xref: /freebsd/sys/contrib/zlib/doc/rfc1952.txt (revision 7648bc9fee8dec6cb3c4941e0165a930fbe8dcb0)
1*c9083b85SXin LI
2*c9083b85SXin LI
3*c9083b85SXin LI
4*c9083b85SXin LI
5*c9083b85SXin LI
6*c9083b85SXin LI
7*c9083b85SXin LINetwork Working Group                                         P. Deutsch
8*c9083b85SXin LIRequest for Comments: 1952                           Aladdin Enterprises
9*c9083b85SXin LICategory: Informational                                         May 1996
10*c9083b85SXin LI
11*c9083b85SXin LI
12*c9083b85SXin LI               GZIP file format specification version 4.3
13*c9083b85SXin LI
14*c9083b85SXin LIStatus of This Memo
15*c9083b85SXin LI
16*c9083b85SXin LI   This memo provides information for the Internet community.  This memo
17*c9083b85SXin LI   does not specify an Internet standard of any kind.  Distribution of
18*c9083b85SXin LI   this memo is unlimited.
19*c9083b85SXin LI
20*c9083b85SXin LIIESG Note:
21*c9083b85SXin LI
22*c9083b85SXin LI   The IESG takes no position on the validity of any Intellectual
23*c9083b85SXin LI   Property Rights statements contained in this document.
24*c9083b85SXin LI
25*c9083b85SXin LINotices
26*c9083b85SXin LI
27*c9083b85SXin LI   Copyright (c) 1996 L. Peter Deutsch
28*c9083b85SXin LI
29*c9083b85SXin LI   Permission is granted to copy and distribute this document for any
30*c9083b85SXin LI   purpose and without charge, including translations into other
31*c9083b85SXin LI   languages and incorporation into compilations, provided that the
32*c9083b85SXin LI   copyright notice and this notice are preserved, and that any
33*c9083b85SXin LI   substantive changes or deletions from the original are clearly
34*c9083b85SXin LI   marked.
35*c9083b85SXin LI
36*c9083b85SXin LI   A pointer to the latest version of this and related documentation in
37*c9083b85SXin LI   HTML format can be found at the URL
38*c9083b85SXin LI   <ftp://ftp.uu.net/graphics/png/documents/zlib/zdoc-index.html>.
39*c9083b85SXin LI
40*c9083b85SXin LIAbstract
41*c9083b85SXin LI
42*c9083b85SXin LI   This specification defines a lossless compressed data format that is
43*c9083b85SXin LI   compatible with the widely used GZIP utility.  The format includes a
44*c9083b85SXin LI   cyclic redundancy check value for detecting data corruption.  The
45*c9083b85SXin LI   format presently uses the DEFLATE method of compression but can be
46*c9083b85SXin LI   easily extended to use other compression methods.  The format can be
47*c9083b85SXin LI   implemented readily in a manner not covered by patents.
48*c9083b85SXin LI
49*c9083b85SXin LI
50*c9083b85SXin LI
51*c9083b85SXin LI
52*c9083b85SXin LI
53*c9083b85SXin LI
54*c9083b85SXin LI
55*c9083b85SXin LI
56*c9083b85SXin LI
57*c9083b85SXin LI
58*c9083b85SXin LIDeutsch                      Informational                      [Page 1]
59*c9083b85SXin LI
60*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
61*c9083b85SXin LI
62*c9083b85SXin LI
63*c9083b85SXin LITable of Contents
64*c9083b85SXin LI
65*c9083b85SXin LI   1. Introduction ................................................... 2
66*c9083b85SXin LI      1.1. Purpose ................................................... 2
67*c9083b85SXin LI      1.2. Intended audience ......................................... 3
68*c9083b85SXin LI      1.3. Scope ..................................................... 3
69*c9083b85SXin LI      1.4. Compliance ................................................ 3
70*c9083b85SXin LI      1.5. Definitions of terms and conventions used ................. 3
71*c9083b85SXin LI      1.6. Changes from previous versions ............................ 3
72*c9083b85SXin LI   2. Detailed specification ......................................... 4
73*c9083b85SXin LI      2.1. Overall conventions ....................................... 4
74*c9083b85SXin LI      2.2. File format ............................................... 5
75*c9083b85SXin LI      2.3. Member format ............................................. 5
76*c9083b85SXin LI          2.3.1. Member header and trailer ........................... 6
77*c9083b85SXin LI              2.3.1.1. Extra field ................................... 8
78*c9083b85SXin LI              2.3.1.2. Compliance .................................... 9
79*c9083b85SXin LI   3. References ..................................................... 9
80*c9083b85SXin LI   4. Security Considerations ....................................... 10
81*c9083b85SXin LI   5. Acknowledgements .............................................. 10
82*c9083b85SXin LI   6. Author's Address .............................................. 10
83*c9083b85SXin LI   7. Appendix: Jean-Loup Gailly's gzip utility ..................... 11
84*c9083b85SXin LI   8. Appendix: Sample CRC Code ..................................... 11
85*c9083b85SXin LI
86*c9083b85SXin LI1. Introduction
87*c9083b85SXin LI
88*c9083b85SXin LI   1.1. Purpose
89*c9083b85SXin LI
90*c9083b85SXin LI      The purpose of this specification is to define a lossless
91*c9083b85SXin LI      compressed data format that:
92*c9083b85SXin LI
93*c9083b85SXin LI          * Is independent of CPU type, operating system, file system,
94*c9083b85SXin LI            and character set, and hence can be used for interchange;
95*c9083b85SXin LI          * Can compress or decompress a data stream (as opposed to a
96*c9083b85SXin LI            randomly accessible file) to produce another data stream,
97*c9083b85SXin LI            using only an a priori bounded amount of intermediate
98*c9083b85SXin LI            storage, and hence can be used in data communications or
99*c9083b85SXin LI            similar structures such as Unix filters;
100*c9083b85SXin LI          * Compresses data with efficiency comparable to the best
101*c9083b85SXin LI            currently available general-purpose compression methods,
102*c9083b85SXin LI            and in particular considerably better than the "compress"
103*c9083b85SXin LI            program;
104*c9083b85SXin LI          * Can be implemented readily in a manner not covered by
105*c9083b85SXin LI            patents, and hence can be practiced freely;
106*c9083b85SXin LI          * Is compatible with the file format produced by the current
107*c9083b85SXin LI            widely used gzip utility, in that conforming decompressors
108*c9083b85SXin LI            will be able to read data produced by the existing gzip
109*c9083b85SXin LI            compressor.
110*c9083b85SXin LI
111*c9083b85SXin LI
112*c9083b85SXin LI
113*c9083b85SXin LI
114*c9083b85SXin LIDeutsch                      Informational                      [Page 2]
115*c9083b85SXin LI
116*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
117*c9083b85SXin LI
118*c9083b85SXin LI
119*c9083b85SXin LI      The data format defined by this specification does not attempt to:
120*c9083b85SXin LI
121*c9083b85SXin LI          * Provide random access to compressed data;
122*c9083b85SXin LI          * Compress specialized data (e.g., raster graphics) as well as
123*c9083b85SXin LI            the best currently available specialized algorithms.
124*c9083b85SXin LI
125*c9083b85SXin LI   1.2. Intended audience
126*c9083b85SXin LI
127*c9083b85SXin LI      This specification is intended for use by implementors of software
128*c9083b85SXin LI      to compress data into gzip format and/or decompress data from gzip
129*c9083b85SXin LI      format.
130*c9083b85SXin LI
131*c9083b85SXin LI      The text of the specification assumes a basic background in
132*c9083b85SXin LI      programming at the level of bits and other primitive data
133*c9083b85SXin LI      representations.
134*c9083b85SXin LI
135*c9083b85SXin LI   1.3. Scope
136*c9083b85SXin LI
137*c9083b85SXin LI      The specification specifies a compression method and a file format
138*c9083b85SXin LI      (the latter assuming only that a file can store a sequence of
139*c9083b85SXin LI      arbitrary bytes).  It does not specify any particular interface to
140*c9083b85SXin LI      a file system or anything about character sets or encodings
141*c9083b85SXin LI      (except for file names and comments, which are optional).
142*c9083b85SXin LI
143*c9083b85SXin LI   1.4. Compliance
144*c9083b85SXin LI
145*c9083b85SXin LI      Unless otherwise indicated below, a compliant decompressor must be
146*c9083b85SXin LI      able to accept and decompress any file that conforms to all the
147*c9083b85SXin LI      specifications presented here; a compliant compressor must produce
148*c9083b85SXin LI      files that conform to all the specifications presented here.  The
149*c9083b85SXin LI      material in the appendices is not part of the specification per se
150*c9083b85SXin LI      and is not relevant to compliance.
151*c9083b85SXin LI
152*c9083b85SXin LI   1.5. Definitions of terms and conventions used
153*c9083b85SXin LI
154*c9083b85SXin LI      byte: 8 bits stored or transmitted as a unit (same as an octet).
155*c9083b85SXin LI      (For this specification, a byte is exactly 8 bits, even on
156*c9083b85SXin LI      machines which store a character on a number of bits different
157*c9083b85SXin LI      from 8.)  See below for the numbering of bits within a byte.
158*c9083b85SXin LI
159*c9083b85SXin LI   1.6. Changes from previous versions
160*c9083b85SXin LI
161*c9083b85SXin LI      There have been no technical changes to the gzip format since
162*c9083b85SXin LI      version 4.1 of this specification.  In version 4.2, some
163*c9083b85SXin LI      terminology was changed, and the sample CRC code was rewritten for
164*c9083b85SXin LI      clarity and to eliminate the requirement for the caller to do pre-
165*c9083b85SXin LI      and post-conditioning.  Version 4.3 is a conversion of the
166*c9083b85SXin LI      specification to RFC style.
167*c9083b85SXin LI
168*c9083b85SXin LI
169*c9083b85SXin LI
170*c9083b85SXin LIDeutsch                      Informational                      [Page 3]
171*c9083b85SXin LI
172*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
173*c9083b85SXin LI
174*c9083b85SXin LI
175*c9083b85SXin LI2. Detailed specification
176*c9083b85SXin LI
177*c9083b85SXin LI   2.1. Overall conventions
178*c9083b85SXin LI
179*c9083b85SXin LI      In the diagrams below, a box like this:
180*c9083b85SXin LI
181*c9083b85SXin LI         +---+
182*c9083b85SXin LI         |   | <-- the vertical bars might be missing
183*c9083b85SXin LI         +---+
184*c9083b85SXin LI
185*c9083b85SXin LI      represents one byte; a box like this:
186*c9083b85SXin LI
187*c9083b85SXin LI         +==============+
188*c9083b85SXin LI         |              |
189*c9083b85SXin LI         +==============+
190*c9083b85SXin LI
191*c9083b85SXin LI      represents a variable number of bytes.
192*c9083b85SXin LI
193*c9083b85SXin LI      Bytes stored within a computer do not have a "bit order", since
194*c9083b85SXin LI      they are always treated as a unit.  However, a byte considered as
195*c9083b85SXin LI      an integer between 0 and 255 does have a most- and least-
196*c9083b85SXin LI      significant bit, and since we write numbers with the most-
197*c9083b85SXin LI      significant digit on the left, we also write bytes with the most-
198*c9083b85SXin LI      significant bit on the left.  In the diagrams below, we number the
199*c9083b85SXin LI      bits of a byte so that bit 0 is the least-significant bit, i.e.,
200*c9083b85SXin LI      the bits are numbered:
201*c9083b85SXin LI
202*c9083b85SXin LI         +--------+
203*c9083b85SXin LI         |76543210|
204*c9083b85SXin LI         +--------+
205*c9083b85SXin LI
206*c9083b85SXin LI      This document does not address the issue of the order in which
207*c9083b85SXin LI      bits of a byte are transmitted on a bit-sequential medium, since
208*c9083b85SXin LI      the data format described here is byte- rather than bit-oriented.
209*c9083b85SXin LI
210*c9083b85SXin LI      Within a computer, a number may occupy multiple bytes.  All
211*c9083b85SXin LI      multi-byte numbers in the format described here are stored with
212*c9083b85SXin LI      the least-significant byte first (at the lower memory address).
213*c9083b85SXin LI      For example, the decimal number 520 is stored as:
214*c9083b85SXin LI
215*c9083b85SXin LI             0        1
216*c9083b85SXin LI         +--------+--------+
217*c9083b85SXin LI         |00001000|00000010|
218*c9083b85SXin LI         +--------+--------+
219*c9083b85SXin LI          ^        ^
220*c9083b85SXin LI          |        |
221*c9083b85SXin LI          |        + more significant byte = 2 x 256
222*c9083b85SXin LI          + less significant byte = 8
223*c9083b85SXin LI
224*c9083b85SXin LI
225*c9083b85SXin LI
226*c9083b85SXin LIDeutsch                      Informational                      [Page 4]
227*c9083b85SXin LI
228*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
229*c9083b85SXin LI
230*c9083b85SXin LI
231*c9083b85SXin LI   2.2. File format
232*c9083b85SXin LI
233*c9083b85SXin LI      A gzip file consists of a series of "members" (compressed data
234*c9083b85SXin LI      sets).  The format of each member is specified in the following
235*c9083b85SXin LI      section.  The members simply appear one after another in the file,
236*c9083b85SXin LI      with no additional information before, between, or after them.
237*c9083b85SXin LI
238*c9083b85SXin LI   2.3. Member format
239*c9083b85SXin LI
240*c9083b85SXin LI      Each member has the following structure:
241*c9083b85SXin LI
242*c9083b85SXin LI         +---+---+---+---+---+---+---+---+---+---+
243*c9083b85SXin LI         |ID1|ID2|CM |FLG|     MTIME     |XFL|OS | (more-->)
244*c9083b85SXin LI         +---+---+---+---+---+---+---+---+---+---+
245*c9083b85SXin LI
246*c9083b85SXin LI      (if FLG.FEXTRA set)
247*c9083b85SXin LI
248*c9083b85SXin LI         +---+---+=================================+
249*c9083b85SXin LI         | XLEN  |...XLEN bytes of "extra field"...| (more-->)
250*c9083b85SXin LI         +---+---+=================================+
251*c9083b85SXin LI
252*c9083b85SXin LI      (if FLG.FNAME set)
253*c9083b85SXin LI
254*c9083b85SXin LI         +=========================================+
255*c9083b85SXin LI         |...original file name, zero-terminated...| (more-->)
256*c9083b85SXin LI         +=========================================+
257*c9083b85SXin LI
258*c9083b85SXin LI      (if FLG.FCOMMENT set)
259*c9083b85SXin LI
260*c9083b85SXin LI         +===================================+
261*c9083b85SXin LI         |...file comment, zero-terminated...| (more-->)
262*c9083b85SXin LI         +===================================+
263*c9083b85SXin LI
264*c9083b85SXin LI      (if FLG.FHCRC set)
265*c9083b85SXin LI
266*c9083b85SXin LI         +---+---+
267*c9083b85SXin LI         | CRC16 |
268*c9083b85SXin LI         +---+---+
269*c9083b85SXin LI
270*c9083b85SXin LI         +=======================+
271*c9083b85SXin LI         |...compressed blocks...| (more-->)
272*c9083b85SXin LI         +=======================+
273*c9083b85SXin LI
274*c9083b85SXin LI           0   1   2   3   4   5   6   7
275*c9083b85SXin LI         +---+---+---+---+---+---+---+---+
276*c9083b85SXin LI         |     CRC32     |     ISIZE     |
277*c9083b85SXin LI         +---+---+---+---+---+---+---+---+
278*c9083b85SXin LI
279*c9083b85SXin LI
280*c9083b85SXin LI
281*c9083b85SXin LI
282*c9083b85SXin LIDeutsch                      Informational                      [Page 5]
283*c9083b85SXin LI
284*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
285*c9083b85SXin LI
286*c9083b85SXin LI
287*c9083b85SXin LI      2.3.1. Member header and trailer
288*c9083b85SXin LI
289*c9083b85SXin LI         ID1 (IDentification 1)
290*c9083b85SXin LI         ID2 (IDentification 2)
291*c9083b85SXin LI            These have the fixed values ID1 = 31 (0x1f, \037), ID2 = 139
292*c9083b85SXin LI            (0x8b, \213), to identify the file as being in gzip format.
293*c9083b85SXin LI
294*c9083b85SXin LI         CM (Compression Method)
295*c9083b85SXin LI            This identifies the compression method used in the file.  CM
296*c9083b85SXin LI            = 0-7 are reserved.  CM = 8 denotes the "deflate"
297*c9083b85SXin LI            compression method, which is the one customarily used by
298*c9083b85SXin LI            gzip and which is documented elsewhere.
299*c9083b85SXin LI
300*c9083b85SXin LI         FLG (FLaGs)
301*c9083b85SXin LI            This flag byte is divided into individual bits as follows:
302*c9083b85SXin LI
303*c9083b85SXin LI               bit 0   FTEXT
304*c9083b85SXin LI               bit 1   FHCRC
305*c9083b85SXin LI               bit 2   FEXTRA
306*c9083b85SXin LI               bit 3   FNAME
307*c9083b85SXin LI               bit 4   FCOMMENT
308*c9083b85SXin LI               bit 5   reserved
309*c9083b85SXin LI               bit 6   reserved
310*c9083b85SXin LI               bit 7   reserved
311*c9083b85SXin LI
312*c9083b85SXin LI            If FTEXT is set, the file is probably ASCII text.  This is
313*c9083b85SXin LI            an optional indication, which the compressor may set by
314*c9083b85SXin LI            checking a small amount of the input data to see whether any
315*c9083b85SXin LI            non-ASCII characters are present.  In case of doubt, FTEXT
316*c9083b85SXin LI            is cleared, indicating binary data. For systems which have
317*c9083b85SXin LI            different file formats for ASCII text and binary data, the
318*c9083b85SXin LI            decompressor can use FTEXT to choose the appropriate format.
319*c9083b85SXin LI            We deliberately do not specify the algorithm used to set
320*c9083b85SXin LI            this bit, since a compressor always has the option of
321*c9083b85SXin LI            leaving it cleared and a decompressor always has the option
322*c9083b85SXin LI            of ignoring it and letting some other program handle issues
323*c9083b85SXin LI            of data conversion.
324*c9083b85SXin LI
325*c9083b85SXin LI            If FHCRC is set, a CRC16 for the gzip header is present,
326*c9083b85SXin LI            immediately before the compressed data. The CRC16 consists
327*c9083b85SXin LI            of the two least significant bytes of the CRC32 for all
328*c9083b85SXin LI            bytes of the gzip header up to and not including the CRC16.
329*c9083b85SXin LI            [The FHCRC bit was never set by versions of gzip up to
330*c9083b85SXin LI            1.2.4, even though it was documented with a different
331*c9083b85SXin LI            meaning in gzip 1.2.4.]
332*c9083b85SXin LI
333*c9083b85SXin LI            If FEXTRA is set, optional extra fields are present, as
334*c9083b85SXin LI            described in a following section.
335*c9083b85SXin LI
336*c9083b85SXin LI
337*c9083b85SXin LI
338*c9083b85SXin LIDeutsch                      Informational                      [Page 6]
339*c9083b85SXin LI
340*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
341*c9083b85SXin LI
342*c9083b85SXin LI
343*c9083b85SXin LI            If FNAME is set, an original file name is present,
344*c9083b85SXin LI            terminated by a zero byte.  The name must consist of ISO
345*c9083b85SXin LI            8859-1 (LATIN-1) characters; on operating systems using
346*c9083b85SXin LI            EBCDIC or any other character set for file names, the name
347*c9083b85SXin LI            must be translated to the ISO LATIN-1 character set.  This
348*c9083b85SXin LI            is the original name of the file being compressed, with any
349*c9083b85SXin LI            directory components removed, and, if the file being
350*c9083b85SXin LI            compressed is on a file system with case insensitive names,
351*c9083b85SXin LI            forced to lower case. There is no original file name if the
352*c9083b85SXin LI            data was compressed from a source other than a named file;
353*c9083b85SXin LI            for example, if the source was stdin on a Unix system, there
354*c9083b85SXin LI            is no file name.
355*c9083b85SXin LI
356*c9083b85SXin LI            If FCOMMENT is set, a zero-terminated file comment is
357*c9083b85SXin LI            present.  This comment is not interpreted; it is only
358*c9083b85SXin LI            intended for human consumption.  The comment must consist of
359*c9083b85SXin LI            ISO 8859-1 (LATIN-1) characters.  Line breaks should be
360*c9083b85SXin LI            denoted by a single line feed character (10 decimal).
361*c9083b85SXin LI
362*c9083b85SXin LI            Reserved FLG bits must be zero.
363*c9083b85SXin LI
364*c9083b85SXin LI         MTIME (Modification TIME)
365*c9083b85SXin LI            This gives the most recent modification time of the original
366*c9083b85SXin LI            file being compressed.  The time is in Unix format, i.e.,
367*c9083b85SXin LI            seconds since 00:00:00 GMT, Jan.  1, 1970.  (Note that this
368*c9083b85SXin LI            may cause problems for MS-DOS and other systems that use
369*c9083b85SXin LI            local rather than Universal time.)  If the compressed data
370*c9083b85SXin LI            did not come from a file, MTIME is set to the time at which
371*c9083b85SXin LI            compression started.  MTIME = 0 means no time stamp is
372*c9083b85SXin LI            available.
373*c9083b85SXin LI
374*c9083b85SXin LI         XFL (eXtra FLags)
375*c9083b85SXin LI            These flags are available for use by specific compression
376*c9083b85SXin LI            methods.  The "deflate" method (CM = 8) sets these flags as
377*c9083b85SXin LI            follows:
378*c9083b85SXin LI
379*c9083b85SXin LI               XFL = 2 - compressor used maximum compression,
380*c9083b85SXin LI                         slowest algorithm
381*c9083b85SXin LI               XFL = 4 - compressor used fastest algorithm
382*c9083b85SXin LI
383*c9083b85SXin LI         OS (Operating System)
384*c9083b85SXin LI            This identifies the type of file system on which compression
385*c9083b85SXin LI            took place.  This may be useful in determining end-of-line
386*c9083b85SXin LI            convention for text files.  The currently defined values are
387*c9083b85SXin LI            as follows:
388*c9083b85SXin LI
389*c9083b85SXin LI
390*c9083b85SXin LI
391*c9083b85SXin LI
392*c9083b85SXin LI
393*c9083b85SXin LI
394*c9083b85SXin LIDeutsch                      Informational                      [Page 7]
395*c9083b85SXin LI
396*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
397*c9083b85SXin LI
398*c9083b85SXin LI
399*c9083b85SXin LI                 0 - FAT filesystem (MS-DOS, OS/2, NT/Win32)
400*c9083b85SXin LI                 1 - Amiga
401*c9083b85SXin LI                 2 - VMS (or OpenVMS)
402*c9083b85SXin LI                 3 - Unix
403*c9083b85SXin LI                 4 - VM/CMS
404*c9083b85SXin LI                 5 - Atari TOS
405*c9083b85SXin LI                 6 - HPFS filesystem (OS/2, NT)
406*c9083b85SXin LI                 7 - Macintosh
407*c9083b85SXin LI                 8 - Z-System
408*c9083b85SXin LI                 9 - CP/M
409*c9083b85SXin LI                10 - TOPS-20
410*c9083b85SXin LI                11 - NTFS filesystem (NT)
411*c9083b85SXin LI                12 - QDOS
412*c9083b85SXin LI                13 - Acorn RISCOS
413*c9083b85SXin LI               255 - unknown
414*c9083b85SXin LI
415*c9083b85SXin LI         XLEN (eXtra LENgth)
416*c9083b85SXin LI            If FLG.FEXTRA is set, this gives the length of the optional
417*c9083b85SXin LI            extra field.  See below for details.
418*c9083b85SXin LI
419*c9083b85SXin LI         CRC32 (CRC-32)
420*c9083b85SXin LI            This contains a Cyclic Redundancy Check value of the
421*c9083b85SXin LI            uncompressed data computed according to the CRC-32 algorithm
422*c9083b85SXin LI            used in the ISO 3309 standard and in section 8.1.1.6.2 of
423*c9083b85SXin LI            ITU-T recommendation V.42.  (See http://www.iso.ch for
424*c9083b85SXin LI            ordering ISO documents. See gopher://info.itu.ch for an
425*c9083b85SXin LI            online version of ITU-T V.42.)
426*c9083b85SXin LI
427*c9083b85SXin LI         ISIZE (Input SIZE)
428*c9083b85SXin LI            This contains the size of the original (uncompressed) input
429*c9083b85SXin LI            data modulo 2^32.
430*c9083b85SXin LI
431*c9083b85SXin LI      2.3.1.1. Extra field
432*c9083b85SXin LI
433*c9083b85SXin LI         If the FLG.FEXTRA bit is set, an "extra field" is present in
434*c9083b85SXin LI         the header, with total length XLEN bytes.  It consists of a
435*c9083b85SXin LI         series of subfields, each of the form:
436*c9083b85SXin LI
437*c9083b85SXin LI            +---+---+---+---+==================================+
438*c9083b85SXin LI            |SI1|SI2|  LEN  |... LEN bytes of subfield data ...|
439*c9083b85SXin LI            +---+---+---+---+==================================+
440*c9083b85SXin LI
441*c9083b85SXin LI         SI1 and SI2 provide a subfield ID, typically two ASCII letters
442*c9083b85SXin LI         with some mnemonic value.  Jean-Loup Gailly
443*c9083b85SXin LI         <gzip@prep.ai.mit.edu> is maintaining a registry of subfield
444*c9083b85SXin LI         IDs; please send him any subfield ID you wish to use.  Subfield
445*c9083b85SXin LI         IDs with SI2 = 0 are reserved for future use.  The following
446*c9083b85SXin LI         IDs are currently defined:
447*c9083b85SXin LI
448*c9083b85SXin LI
449*c9083b85SXin LI
450*c9083b85SXin LIDeutsch                      Informational                      [Page 8]
451*c9083b85SXin LI
452*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
453*c9083b85SXin LI
454*c9083b85SXin LI
455*c9083b85SXin LI            SI1         SI2         Data
456*c9083b85SXin LI            ----------  ----------  ----
457*c9083b85SXin LI            0x41 ('A')  0x70 ('p')  Apollo file type information
458*c9083b85SXin LI
459*c9083b85SXin LI         LEN gives the length of the subfield data, excluding the 4
460*c9083b85SXin LI         initial bytes.
461*c9083b85SXin LI
462*c9083b85SXin LI      2.3.1.2. Compliance
463*c9083b85SXin LI
464*c9083b85SXin LI         A compliant compressor must produce files with correct ID1,
465*c9083b85SXin LI         ID2, CM, CRC32, and ISIZE, but may set all the other fields in
466*c9083b85SXin LI         the fixed-length part of the header to default values (255 for
467*c9083b85SXin LI         OS, 0 for all others).  The compressor must set all reserved
468*c9083b85SXin LI         bits to zero.
469*c9083b85SXin LI
470*c9083b85SXin LI         A compliant decompressor must check ID1, ID2, and CM, and
471*c9083b85SXin LI         provide an error indication if any of these have incorrect
472*c9083b85SXin LI         values.  It must examine FEXTRA/XLEN, FNAME, FCOMMENT and FHCRC
473*c9083b85SXin LI         at least so it can skip over the optional fields if they are
474*c9083b85SXin LI         present.  It need not examine any other part of the header or
475*c9083b85SXin LI         trailer; in particular, a decompressor may ignore FTEXT and OS
476*c9083b85SXin LI         and always produce binary output, and still be compliant.  A
477*c9083b85SXin LI         compliant decompressor must give an error indication if any
478*c9083b85SXin LI         reserved bit is non-zero, since such a bit could indicate the
479*c9083b85SXin LI         presence of a new field that would cause subsequent data to be
480*c9083b85SXin LI         interpreted incorrectly.
481*c9083b85SXin LI
482*c9083b85SXin LI3. References
483*c9083b85SXin LI
484*c9083b85SXin LI   [1] "Information Processing - 8-bit single-byte coded graphic
485*c9083b85SXin LI       character sets - Part 1: Latin alphabet No.1" (ISO 8859-1:1987).
486*c9083b85SXin LI       The ISO 8859-1 (Latin-1) character set is a superset of 7-bit
487*c9083b85SXin LI       ASCII. Files defining this character set are available as
488*c9083b85SXin LI       iso_8859-1.* in ftp://ftp.uu.net/graphics/png/documents/
489*c9083b85SXin LI
490*c9083b85SXin LI   [2] ISO 3309
491*c9083b85SXin LI
492*c9083b85SXin LI   [3] ITU-T recommendation V.42
493*c9083b85SXin LI
494*c9083b85SXin LI   [4] Deutsch, L.P., "DEFLATE Compressed Data Format Specification",
495*c9083b85SXin LI       available in ftp://ftp.uu.net/pub/archiving/zip/doc/
496*c9083b85SXin LI
497*c9083b85SXin LI   [5] Gailly, J.-L., GZIP documentation, available as gzip-*.tar in
498*c9083b85SXin LI       ftp://prep.ai.mit.edu/pub/gnu/
499*c9083b85SXin LI
500*c9083b85SXin LI   [6] Sarwate, D.V., "Computation of Cyclic Redundancy Checks via Table
501*c9083b85SXin LI       Look-Up", Communications of the ACM, 31(8), pp.1008-1013.
502*c9083b85SXin LI
503*c9083b85SXin LI
504*c9083b85SXin LI
505*c9083b85SXin LI
506*c9083b85SXin LIDeutsch                      Informational                      [Page 9]
507*c9083b85SXin LI
508*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
509*c9083b85SXin LI
510*c9083b85SXin LI
511*c9083b85SXin LI   [7] Schwaderer, W.D., "CRC Calculation", April 85 PC Tech Journal,
512*c9083b85SXin LI       pp.118-133.
513*c9083b85SXin LI
514*c9083b85SXin LI   [8] ftp://ftp.adelaide.edu.au/pub/rocksoft/papers/crc_v3.txt,
515*c9083b85SXin LI       describing the CRC concept.
516*c9083b85SXin LI
517*c9083b85SXin LI4. Security Considerations
518*c9083b85SXin LI
519*c9083b85SXin LI   Any data compression method involves the reduction of redundancy in
520*c9083b85SXin LI   the data.  Consequently, any corruption of the data is likely to have
521*c9083b85SXin LI   severe effects and be difficult to correct.  Uncompressed text, on
522*c9083b85SXin LI   the other hand, will probably still be readable despite the presence
523*c9083b85SXin LI   of some corrupted bytes.
524*c9083b85SXin LI
525*c9083b85SXin LI   It is recommended that systems using this data format provide some
526*c9083b85SXin LI   means of validating the integrity of the compressed data, such as by
527*c9083b85SXin LI   setting and checking the CRC-32 check value.
528*c9083b85SXin LI
529*c9083b85SXin LI5. Acknowledgements
530*c9083b85SXin LI
531*c9083b85SXin LI   Trademarks cited in this document are the property of their
532*c9083b85SXin LI   respective owners.
533*c9083b85SXin LI
534*c9083b85SXin LI   Jean-Loup Gailly designed the gzip format and wrote, with Mark Adler,
535*c9083b85SXin LI   the related software described in this specification.  Glenn
536*c9083b85SXin LI   Randers-Pehrson converted this document to RFC and HTML format.
537*c9083b85SXin LI
538*c9083b85SXin LI6. Author's Address
539*c9083b85SXin LI
540*c9083b85SXin LI   L. Peter Deutsch
541*c9083b85SXin LI   Aladdin Enterprises
542*c9083b85SXin LI   203 Santa Margarita Ave.
543*c9083b85SXin LI   Menlo Park, CA 94025
544*c9083b85SXin LI
545*c9083b85SXin LI   Phone: (415) 322-0103 (AM only)
546*c9083b85SXin LI   FAX:   (415) 322-1734
547*c9083b85SXin LI   EMail: <ghost@aladdin.com>
548*c9083b85SXin LI
549*c9083b85SXin LI   Questions about the technical content of this specification can be
550*c9083b85SXin LI   sent by email to:
551*c9083b85SXin LI
552*c9083b85SXin LI   Jean-Loup Gailly <gzip@prep.ai.mit.edu> and
553*c9083b85SXin LI   Mark Adler <madler@alumni.caltech.edu>
554*c9083b85SXin LI
555*c9083b85SXin LI   Editorial comments on this specification can be sent by email to:
556*c9083b85SXin LI
557*c9083b85SXin LI   L. Peter Deutsch <ghost@aladdin.com> and
558*c9083b85SXin LI   Glenn Randers-Pehrson <randeg@alumni.rpi.edu>
559*c9083b85SXin LI
560*c9083b85SXin LI
561*c9083b85SXin LI
562*c9083b85SXin LIDeutsch                      Informational                     [Page 10]
563*c9083b85SXin LI
564*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
565*c9083b85SXin LI
566*c9083b85SXin LI
567*c9083b85SXin LI7. Appendix: Jean-Loup Gailly's gzip utility
568*c9083b85SXin LI
569*c9083b85SXin LI   The most widely used implementation of gzip compression, and the
570*c9083b85SXin LI   original documentation on which this specification is based, were
571*c9083b85SXin LI   created by Jean-Loup Gailly <gzip@prep.ai.mit.edu>.  Since this
572*c9083b85SXin LI   implementation is a de facto standard, we mention some more of its
573*c9083b85SXin LI   features here.  Again, the material in this section is not part of
574*c9083b85SXin LI   the specification per se, and implementations need not follow it to
575*c9083b85SXin LI   be compliant.
576*c9083b85SXin LI
577*c9083b85SXin LI   When compressing or decompressing a file, gzip preserves the
578*c9083b85SXin LI   protection, ownership, and modification time attributes on the local
579*c9083b85SXin LI   file system, since there is no provision for representing protection
580*c9083b85SXin LI   attributes in the gzip file format itself.  Since the file format
581*c9083b85SXin LI   includes a modification time, the gzip decompressor provides a
582*c9083b85SXin LI   command line switch that assigns the modification time stored in the
583*c9083b85SXin LI   gzip header, rather than the local modification time of the
584*c9083b85SXin LI   compressed input, to the decompressed output.
585*c9083b85SXin LI
586*c9083b85SXin LI8. Appendix: Sample CRC Code
587*c9083b85SXin LI
588*c9083b85SXin LI   The following sample code represents a practical implementation of
589*c9083b85SXin LI   the CRC (Cyclic Redundancy Check). (See also ISO 3309 and ITU-T V.42
590*c9083b85SXin LI   for a formal specification.)
591*c9083b85SXin LI
592*c9083b85SXin LI   The sample code is in the ANSI C programming language. Non-C users
593*c9083b85SXin LI   may find it easier to read with these hints:
594*c9083b85SXin LI
595*c9083b85SXin LI      &      Bitwise AND operator.
596*c9083b85SXin LI      ^      Bitwise exclusive-OR operator.
597*c9083b85SXin LI      >>     Bitwise right shift operator. When applied to an
598*c9083b85SXin LI             unsigned quantity, as here, right shift inserts zero
599*c9083b85SXin LI             bit(s) at the left.
600*c9083b85SXin LI      !      Logical NOT operator.
601*c9083b85SXin LI      ++     "n++" increments the variable n.
602*c9083b85SXin LI      0xNNN  0x introduces a hexadecimal (base 16) constant.
603*c9083b85SXin LI             Suffix L indicates a long value (at least 32 bits).
604*c9083b85SXin LI
605*c9083b85SXin LI      /* Table of CRCs of all 8-bit messages. */
606*c9083b85SXin LI      unsigned long crc_table[256];
607*c9083b85SXin LI
608*c9083b85SXin LI      /* Flag: has the table been computed? Initially false. */
609*c9083b85SXin LI      int crc_table_computed = 0;
610*c9083b85SXin LI
611*c9083b85SXin LI      /* Make the table for a fast CRC. */
612*c9083b85SXin LI      void make_crc_table(void)
613*c9083b85SXin LI      {
614*c9083b85SXin LI        unsigned long c;
615*c9083b85SXin LI
616*c9083b85SXin LI
617*c9083b85SXin LI
618*c9083b85SXin LIDeutsch                      Informational                     [Page 11]
619*c9083b85SXin LI
620*c9083b85SXin LIRFC 1952             GZIP File Format Specification             May 1996
621*c9083b85SXin LI
622*c9083b85SXin LI
623*c9083b85SXin LI        int n, k;
624*c9083b85SXin LI        for (n = 0; n < 256; n++) {
625*c9083b85SXin LI          c = (unsigned long) n;
626*c9083b85SXin LI          for (k = 0; k < 8; k++) {
627*c9083b85SXin LI            if (c & 1) {
628*c9083b85SXin LI              c = 0xedb88320L ^ (c >> 1);
629*c9083b85SXin LI            } else {
630*c9083b85SXin LI              c = c >> 1;
631*c9083b85SXin LI            }
632*c9083b85SXin LI          }
633*c9083b85SXin LI          crc_table[n] = c;
634*c9083b85SXin LI        }
635*c9083b85SXin LI        crc_table_computed = 1;
636*c9083b85SXin LI      }
637*c9083b85SXin LI
638*c9083b85SXin LI      /*
639*c9083b85SXin LI         Update a running crc with the bytes buf[0..len-1] and return
640*c9083b85SXin LI       the updated crc. The crc should be initialized to zero. Pre- and
641*c9083b85SXin LI       post-conditioning (one's complement) is performed within this
642*c9083b85SXin LI       function so it shouldn't be done by the caller. Usage example:
643*c9083b85SXin LI
644*c9083b85SXin LI         unsigned long crc = 0L;
645*c9083b85SXin LI
646*c9083b85SXin LI         while (read_buffer(buffer, length) != EOF) {
647*c9083b85SXin LI           crc = update_crc(crc, buffer, length);
648*c9083b85SXin LI         }
649*c9083b85SXin LI         if (crc != original_crc) error();
650*c9083b85SXin LI      */
651*c9083b85SXin LI      unsigned long update_crc(unsigned long crc,
652*c9083b85SXin LI                      unsigned char *buf, int len)
653*c9083b85SXin LI      {
654*c9083b85SXin LI        unsigned long c = crc ^ 0xffffffffL;
655*c9083b85SXin LI        int n;
656*c9083b85SXin LI
657*c9083b85SXin LI        if (!crc_table_computed)
658*c9083b85SXin LI          make_crc_table();
659*c9083b85SXin LI        for (n = 0; n < len; n++) {
660*c9083b85SXin LI          c = crc_table[(c ^ buf[n]) & 0xff] ^ (c >> 8);
661*c9083b85SXin LI        }
662*c9083b85SXin LI        return c ^ 0xffffffffL;
663*c9083b85SXin LI      }
664*c9083b85SXin LI
665*c9083b85SXin LI      /* Return the CRC of the bytes buf[0..len-1]. */
666*c9083b85SXin LI      unsigned long crc(unsigned char *buf, int len)
667*c9083b85SXin LI      {
668*c9083b85SXin LI        return update_crc(0L, buf, len);
669*c9083b85SXin LI      }
670*c9083b85SXin LI
671*c9083b85SXin LI
672*c9083b85SXin LI
673*c9083b85SXin LI
674*c9083b85SXin LIDeutsch                      Informational                     [Page 12]
675*c9083b85SXin LI
676