xref: /freebsd/contrib/tzdata/zishrink.awk (revision e9b1dc32c9bd2ebae5f9e140bfa0e0321bc366b5)
1# Convert tzdata source into a smaller version of itself.
2
3# Contributed by Paul Eggert.  This file is in the public domain.
4
5# This is not a general-purpose converter; it is designed for current tzdata.
6# 'zic' should treat this script's output as if it were identical to
7# this script's input.
8
# Remember that the abbreviation N now stands for the rule name NAME.
# If N is already taken, print a "collision" comment naming both the
# earlier owner and NAME, then exit with status 1.

function record_hash(n, name)
{
  if (!used_hashes[n]) {
    # First claim on this abbreviation.
    used_hashes[n] = name
    return
  }
  printf "# ! collision: %s %s\n", used_hashes[n], name
  exit 1
}
19
# Derive a short replacement for the rule name NAME, register the pair
# via record_hash (which checks for collisions), and return the short name.
# N is a local variable, not an argument.

function gen_rule_name(name, n)
{
  # Use a simple mnemonic: the first two letters.
  record_hash(n = substr(name, 1, 2), name)
  # printf "# %s = %s\n", n, name
  return n
}
31
# Fill the global "rule" table with hand-picked abbreviations for
# well-known rule names, then register every pair with record_hash so
# that collisions are detected.  NAME is a local loop variable.

function prehash_rule_names(name)
{
  # Rule names are not part of the tzdb API, so shorter substitutes are
  # safe.  Keeping the substitutes stable across releases makes the
  # output easier to compare, but the 1-letter names below are not
  # standardized and may change arbitrarily between releases: the main
  # goal is compression, not comparison.

  # One-letter abbreviations: these rule names saved the most space
  # circa 2018e.
  rule["Arg"] = "A"
  rule["Brazil"] = "B"
  rule["Canada"] = "C"
  rule["Denmark"] = "D"
  rule["EU"] = "E"
  rule["France"] = "F"
  rule["GB-Eire"] = "G"
  rule["Halifax"] = "H"
  rule["Italy"] = "I"
  rule["Jordan"] = "J"
  rule["Egypt"] = "K" # "Kemet" in ancient Egyptian
  rule["Libya"] = "L"
  rule["Morocco"] = "M"
  rule["Neth"] = "N"
  rule["Poland"] = "O" # arbitrary
  rule["Palestine"] = "P"
  rule["Cuba"] = "Q" # Its start sounds like "Q".
  rule["Russia"] = "R"
  rule["Syria"] = "S"
  rule["Turkey"] = "T"
  rule["Uruguay"] = "U"
  rule["Vincennes"] = "V"
  rule["Winn"] = "W"
  rule["Mongol"] = "X" # arbitrary
  rule["NT_YK"] = "Y"
  rule["Zion"] = "Z"
  rule["Austria"] = "a"
  rule["Belgium"] = "b"
  rule["C-Eur"] = "c"
  rule["Algeria"] = "d" # country code DZ
  rule["E-Eur"] = "e"
  rule["Taiwan"] = "f" # Formosa
  rule["Greece"] = "g"
  rule["Hungary"] = "h"
  rule["Iran"] = "i"
  rule["StJohns"] = "j"
  rule["Chatham"] = "k" # arbitrary
  rule["Lebanon"] = "l"
  rule["Mexico"] = "m"
  rule["Tunisia"] = "n" # country code TN
  rule["Moncton"] = "o" # arbitrary
  rule["Port"] = "p"
  rule["Albania"] = "q" # arbitrary
  rule["Regina"] = "r"
  rule["Spain"] = "s"
  rule["Toronto"] = "t"
  rule["US"] = "u"
  rule["Louisville"] = "v" # ville
  rule["Iceland"] = "w" # arbitrary
  rule["Chile"] = "x" # arbitrary
  rule["Para"] = "y" # country code PY
  rule["Romania"] = "z" # arbitrary
  rule["Macau"] = "_" # arbitrary

  # Remaining names that are countries get their ISO 3166 alpha-2 code.
  # This is more systematic, and avoids collisions (e.g., Malta and Moldova).
  rule["Armenia"] = "AM"
  rule["Aus"] = "AU"
  rule["Azer"] = "AZ"
  rule["Barb"] = "BB"
  rule["Dhaka"] = "BD"
  rule["Bulg"] = "BG"
  rule["Bahamas"] = "BS"
  rule["Belize"] = "BZ"
  rule["Swiss"] = "CH"
  rule["Cook"] = "CK"
  rule["PRC"] = "CN"
  rule["Cyprus"] = "CY"
  rule["Czech"] = "CZ"
  rule["Germany"] = "DE"
  rule["DR"] = "DO"
  rule["Ecuador"] = "EC"
  rule["Finland"] = "FI"
  rule["Fiji"] = "FJ"
  rule["Falk"] = "FK"
  rule["Ghana"] = "GH"
  rule["Guat"] = "GT"
  rule["Hond"] = "HN"
  rule["Haiti"] = "HT"
  rule["Eire"] = "IE"
  rule["Iraq"] = "IQ"
  rule["Japan"] = "JP"
  rule["Kyrgyz"] = "KG"
  rule["ROK"] = "KR"
  rule["Latvia"] = "LV"
  # NOTE(review): the ISO 3166 alpha-2 code for Luxembourg is LU, not LX;
  # the value is kept as-is so the generated output does not change.
  rule["Lux"] = "LX"
  rule["Moldova"] = "MD"
  rule["Malta"] = "MT"
  rule["Mauritius"] = "MU"
  rule["Namibia"] = "NA"
  rule["Nic"] = "NI"
  rule["Norway"] = "NO"
  rule["Peru"] = "PE"
  rule["Phil"] = "PH"
  rule["Pakistan"] = "PK"
  rule["Sudan"] = "SD"
  rule["Salv"] = "SV"
  rule["Tonga"] = "TO"
  rule["Vanuatu"] = "VU"

  # Avoid collisions.
  rule["Detroit"] = "Dt" # De = Denver

  # Register every predefined abbreviation so later ones cannot reuse it.
  for (name in rule)
    record_hash(rule[name], name)
}
150
# Process an input line and save it for later output.
#
# LINE is one line of zic input.  It is stripped of comments, abbreviated
# field by field, and appended to the global output_line array (indexed
# from 0, with the count kept in nout).  Rule lines that are known
# duplicates or unneeded are dropped entirely.  If the line starts a Zone
# or Link whose name was already defined, the earlier definition is
# blanked out in output_line.
# FIELD, END, I, N and STARTDEF are local variables, not arguments.

function process_input_line(line, field, end, i, n, startdef)
{
  # Remove comments, normalize spaces, and append a space to each line.
  # The trailing space lets the " word " patterns below match the last field.
  sub(/#.*/, "", line)
  line = line " "
  gsub(/[\t ]+/, " ", line)

  # Abbreviate keywords.  Do not abbreviate "Link" to just "L",
  # as pre-2017c zic erroneously diagnoses "Li" as ambiguous.
  sub(/^Link /, "Li ", line)
  sub(/^Rule /, "R ", line)
  sub(/^Zone /, "Z ", line)

  # SystemV rules are not needed.
  if (line ~ /^R SystemV /) return

  # Replace FooAsia rules with the same rules without "Asia", as they
  # are duplicates.  The rule definitions themselves are dropped; in
  # other lines the four letters "Asia" are cut out of the name.
  if (match(line, /[^ ]Asia /)) {
    if (line ~ /^R /) return
    line = substr(line, 1, RSTART) substr(line, RSTART + 5)
  }
  # Replace SpainAfrica rules with Morocco, as they are duplicates.
  if (match(line, / SpainAfrica /)) {
    if (line ~ /^R /) return
    line = substr(line, 1, RSTART) "Morocco" substr(line, RSTART + RLENGTH - 1)
  }

  # Abbreviate times.
  # First strip leading zeros from each number, keeping the delimiter
  # before it and its final digit ...
  while (match(line, /[: ]0+[0-9]/))
    line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
  # ... then drop a ":0" component that is not followed by another colon.
  while (match(line, /:0[^:]/))
    line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)

  # Abbreviate weekday names.  Do not abbreviate "Sun" and "Sat", as
  # pre-2017c zic erroneously diagnoses "Su" and "Sa" as ambiguous.
  # END is the position just past the match, so the weekday name
  # occupies END-4 through END-2.
  while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
    # Keep only the first letter of the weekday.
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 4) substr(line, end - 1)
  }
  while (match(line, / (last)?(Tue|Thu)[ <>]/)) {
    # Keep the first two letters of the weekday.
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 3) substr(line, end - 1)
  }

  # Abbreviate "max", "only" and month names.
  # Do not abbreviate "min", as pre-2017c zic erroneously diagnoses "mi"
  # as ambiguous.
  gsub(/ max /, " ma ", line)
  gsub(/ only /, " o ", line)
  gsub(/ Jan /, " Ja ", line)
  gsub(/ Feb /, " F ", line)
  gsub(/ Apr /, " Ap ", line)
  gsub(/ Aug /, " Au ", line)
  gsub(/ Sep /, " S ", line)
  gsub(/ Oct /, " O ", line)
  gsub(/ Nov /, " N ", line)
  gsub(/ Dec /, " D ", line)

  # Strip leading and trailing space.
  sub(/^ /, "", line)
  sub(/ $/, "", line)

  # Remove unnecessary trailing zero fields.
  sub(/ 0+$/, "", line)

  # Remove unnecessary trailing days-of-month "1".
  if (match(line, /[A-Za-z] 1$/))
    line = substr(line, 1, RSTART)

  # Remove unnecessary trailing " Ja" (for January).
  sub(/ Ja$/, "", line)

  n = split(line, field)

  # Abbreviate rule names.  The rule name is field 4 of a Zone line and
  # field 2 of a Rule or continuation line; Link lines have none.  A field
  # starting with "-", "+" or a digit is not a rule name and is left alone.
  i = field[1] == "Z" ? 4 : field[1] == "Li" ? 0 : 2
  if (i && field[i] ~ /^[^-+0-9]/) {
    if (!rule[field[i]])
      rule[field[i]] = gen_rule_name(field[i])
    field[i] = rule[field[i]]
  }

  # If this line starts a zone or link that supersedes an earlier
  # definition of the same name, delete the earlier one (its first line
  # and any continuation lines after it) from the saved output lines.
  startdef = ""
  if (field[1] == "Z")
    startdef = field[2]
  else if (field[1] == "Li")
    startdef = field[3]
  if (startdef) {
    # zonedef holds 1 + the output_line index of a definition's first
    # line, so that 0 (unset) means "not defined yet".
    i = zonedef[startdef]
    if (i) {
      do
        output_line[i - 1] = ""
      while (output_line[i++] ~ /^[-+0-9]/);
    }
    # Record the position only here, on the first line of a definition.
    # Recording it for continuation lines too would leave the index
    # pointing at the definition's last line, so that a later
    # superseding definition would blank only that last line.
    zonedef[startdef] = nout + 1
  }

  # Save the line for later output.
  line = field[1]
  for (i = 2; i <= n; i++)
    line = line " " field[i]
  output_line[nout++] = line
}
261
# Print every saved output line that has not been blanked out, in the
# order in which the lines were saved.  I is a local loop index.

function output_saved_lines(i)
{
  for (i = 0; i < nout; i++) {
    # Blanked (superseded) entries are skipped.
    if (!output_line[i])
      continue
    print output_line[i]
  }
}
268
BEGIN {
  # Files that the output normally depends on.  Each starts with a
  # count of 1, which is incremented below each time the file is
  # listed in 'deps'.
  default_dep["africa"] = 1
  default_dep["antarctica"] = 1
  default_dep["asia"] = 1
  default_dep["australasia"] = 1
  default_dep["backward"] = 1
  default_dep["etcetera"] = 1
  default_dep["europe"] = 1
  default_dep["factory"] = 1
  default_dep["northamerica"] = 1
  default_dep["southamerica"] = 1
  default_dep["systemv"] = 1
  default_dep["ziguard.awk"] = 1
  default_dep["zishrink.awk"] = 1

  # Output a version string from 'version' and related configuration variables
  # supported by tzdb's Makefile.  If you change the makefile or any other files
  # that affect the output of this script, you should append '-SOMETHING'
  # to the contents of 'version', where SOMETHING identifies what was changed.

  # Collect the differences from the default dependency list in 'ddeps':
  # extra files appear as " NAME", missing default files as " !NAME".
  ndeps = split(deps, dep)
  ddeps = ""
  i = 0
  while (++i <= ndeps) {
    if (!default_dep[dep[i]])
      ddeps = ddeps " " dep[i]
    else
      default_dep[dep[i]]++
  }
  for (d in default_dep) {
    # A count still at 1 means the default file was never listed.
    if (default_dep[d] == 1)
      ddeps = ddeps " !" d
  }

  # Header comments; non-default settings are mentioned only when they apply.
  print "# version", version
  if (dataform != "main")
    print "# dataform", dataform
  if (redo != "posix_right")
    print "# redo " redo
  if (ddeps)
    print "# ddeps" ddeps
  print "# This zic input file is in the public domain."

  prehash_rule_names()
}
318
# Handle every line whose first non-blank character is not "#";
# blank lines and comment-only lines are skipped.
/^[\t ]*[^#\t ]/ { process_input_line($0) }

# Once all input has been read, emit the lines that were saved.
END { output_saved_lines() }
326