xref: /freebsd/contrib/tzdata/zishrink.awk (revision ebacd8013fe5f7fdf9f6a5b286f6680dd2891036)
1# Convert tzdata source into a smaller version of itself.
2
3# Contributed by Paul Eggert.  This file is in the public domain.
4
5# This is not a general-purpose converter; it is designed for current tzdata.
6# 'zic' should treat this script's output as if it were identical to
7# this script's input.
8
# Remember that the abbreviation N now stands for the rule NAME.
# No two rule names may share an abbreviation: if N is already taken,
# print a diagnostic comment naming both rules and exit with status 1.

function record_hash(n, name)
{
  if (!used_hashes[n]) {
    # First use of this abbreviation: claim it.
    used_hashes[n] = name
    return
  }
  printf "# ! collision: %s %s\n", used_hashes[n], name
  exit 1
}
19
# Derive an abbreviation for the rule NAME, register it through
# record_hash (which exits on a collision), and return it.
# The abbreviation is simply NAME's first two characters.

function gen_rule_name(name, \
		       abbr)
{
  abbr = substr(name, 1, 2)
  record_hash(abbr, name)
  return abbr
}
32
function prehash_rule_names( \
			    name, i, kv, npairs, pair, table)
{
  # Rule names are not part of the tzdb API, so shorter ones are
  # substituted.  Keeping the substitutions stable from one release to
  # the next makes the output easier to compare, but the names below
  # are not standardized and may change arbitrarily; the main goal is
  # compression, not comparison.
  #
  # The table is a blank-separated list of NAME=ABBREVIATION pairs.

  # One-letter abbreviations: these rule names saved the most space
  # circa 2018e.  Mostly the initial letter; the exceptions are
  # Egypt=K ("Kemet" in ancient Egyptian), Cuba=Q (its start sounds
  # like "Q"), and the arbitrary Poland=O and Mongol=X.
  table = "Arg=A Brazil=B Canada=C Denmark=D EU=E France=F GB-Eire=G"
  table = table " Halifax=H Italy=I Jordan=J Egypt=K Libya=L Morocco=M"
  table = table " Neth=N Poland=O Palestine=P Cuba=Q Russia=R Syria=S"
  table = table " Turkey=T Uruguay=U Vincennes=V Winn=W Mongol=X"
  table = table " NT_YK=Y Zion=Z"

  # Lower-case one-letter abbreviations.  Exceptions to "initial
  # letter": Algeria=d (country code DZ), Taiwan=f (Formosa),
  # Tunisia=n (country code TN), Louisville=v (ville), Para=y (country
  # code PY); Chatham, Moncton, Albania, Iceland, Chile, Romania and
  # Macau are arbitrary.
  table = table " Austria=a Belgium=b C-Eur=c Algeria=d E-Eur=e Taiwan=f"
  table = table " Greece=g Hungary=h Iran=i StJohns=j Chatham=k Lebanon=l"
  table = table " Mexico=m Tunisia=n Moncton=o Port=p Albania=q Regina=r"
  table = table " Spain=s Toronto=t US=u Louisville=v Iceland=w Chile=x"
  table = table " Para=y Romania=z Macau=_"

  # Two-letter country codes for the remaining names that are
  # countries.  This is more systematic, and avoids collisions
  # (e.g., Malta and Moldova).
  table = table " Armenia=AM Aus=AU Azer=AZ Barb=BB Dhaka=BD Bulg=BG"
  table = table " Bahamas=BS Belize=BZ Swiss=CH Cook=CK PRC=CN Cyprus=CY"
  table = table " Czech=CZ Germany=DE DR=DO Ecuador=EC Finland=FI Fiji=FJ"
  table = table " Falk=FK Ghana=GH Guat=GT Hond=HN Haiti=HT Eire=IE"
  table = table " Iraq=IQ Japan=JP Kyrgyz=KG ROK=KR Latvia=LV Lux=LX"
  table = table " Moldova=MD Malta=MT Mauritius=MU Namibia=NA Nic=NI"
  table = table " Norway=NO Peru=PE Phil=PH Pakistan=PK Sudan=SD Salv=SV"
  table = table " Tonga=TO Vanuatu=VU"

  # Avoid collisions with generated two-letter names: De = Denver.
  table = table " Detroit=Dt"

  # Load the table into rule[], in the order listed above.
  npairs = split(table, pair, " ")
  for (i = 1; i <= npairs; i++) {
    split(pair[i], kv, "=")
    rule[kv[1]] = kv[2]
  }

  # Reserve every predefined abbreviation, checking for collisions.
  for (name in rule) {
    record_hash(rule[name], name)
  }
}
152
# Return the first N elements of FIELD joined by single spaces.
# When N is less than 2 the result is just FIELD[1].

function make_line(n, field, \
		   idx, joined)
{
  joined = field[1]
  idx = 1
  while (++idx <= n)
    joined = joined " " field[idx]
  return joined
}
161
# Process the input line LINE and save it for later output.
#
# LINE is rewritten into a shorter form (comments removed, blanks
# collapsed, keywords, times, weekday and month names abbreviated,
# redundant trailing fields dropped) and appended to output_line[].
# A Rule line whose text matches /[^ ]Asia / is dropped entirely.
#
# Globals used:
#   output_line[], nout  saved output lines and their count (appended to)
#   rule_used[]          set to 1 for each rule name a zone line refers to
#   zonedef[]            for each Zone/Link name, 1 + the output_line index
#                        of the line that starts its definition
#   zonename             name of the Zone/Link most recently started,
#                        or "" after a Rule line

function process_input_line(line, \
			    f, field, end, i, n, r, startdef, \
			    linkline, ruleline, zoneline)
{
  # Remove comments, normalize spaces, and append a space to each line.
  # The trailing space lets the patterns below match a final field
  # the same way as an interior one.
  sub(/#.*/, "", line)
  line = line " "
  gsub(/[\t ]+/, " ", line)

  # Abbreviate keywords and determine line type.
  # sub returns 1 if it replaced something, so each flag is 0 or 1.
  linkline = sub(/^Link /, "L ", line)
  ruleline = sub(/^Rule /, "R ", line)
  zoneline = sub(/^Zone /, "Z ", line)

  # Replace FooAsia rules with the same rules without "Asia", as they
  # are duplicates: drop the FooAsia Rule lines themselves, and strip
  # "Asia" from references to them on other lines.
  if (match(line, /[^ ]Asia /)) {
    if (ruleline) return
    line = substr(line, 1, RSTART) substr(line, RSTART + 5)
  }

  # Abbreviate times.
  # First strip leading zeros from numbers that follow ":" or " " ...
  while (match(line, /[: ]0+[0-9]/))
    line = substr(line, 1, RSTART) substr(line, RSTART + RLENGTH - 1)
  # ... then drop a ":0" component that is not followed by another ":".
  while (match(line, /:0[^:]/))
    line = substr(line, 1, RSTART - 1) substr(line, RSTART + 2)

  # Abbreviate weekday names.
  # Mon, Wed and Fri are cut to their first letter ...
  while (match(line, / (last)?(Mon|Wed|Fri)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 4) substr(line, end - 1)
  }
  # ... and Sun, Tue, Thu and Sat to their first two letters.
  while (match(line, / (last)?(Sun|Tue|Thu|Sat)[ <>]/)) {
    end = RSTART + RLENGTH
    line = substr(line, 1, end - 3) substr(line, end - 1)
  }

  # Abbreviate "max", "min", "only" and month names.
  gsub(/ max /, " ma ", line)
  gsub(/ min /, " mi ", line)
  gsub(/ only /, " o ", line)
  gsub(/ Jan /, " Ja ", line)
  gsub(/ Feb /, " F ", line)
  gsub(/ Apr /, " Ap ", line)
  gsub(/ Aug /, " Au ", line)
  gsub(/ Sep /, " S ", line)
  gsub(/ Oct /, " O ", line)
  gsub(/ Nov /, " N ", line)
  gsub(/ Dec /, " D ", line)

  # Strip leading and trailing space.
  sub(/^ /, "", line)
  sub(/ $/, "", line)

  # Remove unnecessary trailing zero fields.
  sub(/ 0+$/, "", line)

  # Remove unnecessary trailing days-of-month "1".
  if (match(line, /[A-Za-z] 1$/))
    line = substr(line, 1, RSTART)

  # Remove unnecessary trailing " Ja" (for January).
  sub(/ Ja$/, "", line)

  n = split(line, field)

  # Record which rule names are used, and generate their abbreviations.
  # The rule name is field 4 of a Zone line and field 2 of any other
  # non-Link, non-Rule line; Link and Rule lines refer to no rule, so
  # index 0 (always empty) is used for them.
  f = zoneline ? 4 : linkline || ruleline ? 0 : 2
  r = field[f]
  if (r ~ /^[^-+0-9]/) {
    rule_used[r] = 1
  }

  # If this zone supersedes an earlier one, delete the earlier one
  # from the saved output lines.
  startdef = ""
  if (zoneline)
    zonename = startdef = field[2]
  else if (linkline)
    zonename = startdef = field[3]
  else if (ruleline)
    zonename = ""
  if (startdef) {
    i = zonedef[startdef]
    if (i) {
      # Blank the earlier definition's first line, then every following
      # saved line that starts with a sign or digit.
      do
        output_line[i - 1] = ""
      while (output_line[i++] ~ /^[-+0-9]/);
    }
    # Remember where this definition starts.  This is recorded only on
    # the line that begins the definition: recording it on every line
    # would move the start to the last continuation line, and a later
    # superseding definition would then leave the header and the
    # earlier continuation lines in the output.
    zonedef[startdef] = nout + 1
  }

  # Save the line for later output.
  output_line[nout++] = make_line(n, field)
}
259
# Blank out every saved "R" (Rule) line whose rule name was never
# marked in rule_used[], so that it is not printed.

function omit_unused_rules( \
			   idx, part)
{
  idx = -1
  while (++idx < nout) {
    split(output_line[idx], part)
    if (part[1] != "R")
      continue
    if (rule_used[part[2]])
      continue
    output_line[idx] = ""
  }
}
270
# Replace the rule name on every saved line with its abbreviation.
# Names already present in rule[] use that entry; any other name gets
# an abbreviation from gen_rule_name, which is then cached in rule[].

function abbreviate_rule_names( \
			       short, col, part, idx, nparts, longname)
{
  for (idx = 0; idx < nout; idx++) {
    nparts = split(output_line[idx], part)
    if (!nparts)
      continue

    # Locate the rule-name column: 4 on "Z" lines, none (index 0) on
    # "L" lines, and 2 on everything else.
    if (part[1] == "Z")
      col = 4
    else if (part[1] == "L")
      col = 0
    else
      col = 2

    # Fields starting with a sign or digit are not rule names.
    longname = part[col]
    if (longname !~ /^[^-+0-9]/)
      continue

    short = rule[longname]
    if (!short) {
      short = gen_rule_name(longname)
      rule[longname] = short
    }
    part[col] = short
    output_line[idx] = make_line(nparts, part)
  }
}
290
# Print the saved output lines in order, skipping those that have
# been blanked out.

function output_saved_lines( \
			    idx)
{
  idx = 0
  while (idx < nout) {
    if (output_line[idx])
      print output_line[idx]
    idx++
  }
}
298
BEGIN {
  # Files that the output normally depends on.
  default_dep_list = "africa antarctica asia australasia backward etcetera"
  default_dep_list = default_dep_list " europe factory northamerica southamerica"
  default_dep_list = default_dep_list " ziguard.awk zishrink.awk"
  n_default_deps = split(default_dep_list, default_dep_name)
  for (i = 1; i <= n_default_deps; i++) {
    default_dep[default_dep_name[i]] = 1
  }

  # Output a version string from 'version' and related configuration variables
  # supported by tzdb's Makefile.  If you change the makefile or any other files
  # that affect the output of this script, you should append '-SOMETHING'
  # to the contents of 'version', where SOMETHING identifies what was changed.

  # Compare the actual dependencies in 'deps' with the defaults:
  # extra files are listed by name, missing defaults as "!name".
  ndeps = split(deps, dep)
  ddeps = ""
  for (i = 1; i <= ndeps; i++) {
    if (!default_dep[dep[i]]) {
      ddeps = ddeps " " dep[i]
    } else {
      # Bump the count so this default is known to be present.
      default_dep[dep[i]]++
    }
  }
  for (d in default_dep) {
    # A count still equal to 1 means the default was never seen.
    if (default_dep[d] == 1) {
      ddeps = ddeps " !" d
    }
  }

  # Emit the header; settings that have their usual value are omitted.
  print "# version", version
  if (dataform != "main") {
    print "# dataform", dataform
  }
  if (redo != "posix_right") {
    print "# redo " redo
  }
  if (ddeps) {
    print "# ddeps" ddeps
  }
  print "# This zic input file is in the public domain."

  prehash_rule_names()
}
347
# Shrink and save every line whose first non-blank character is not
# "#"; blank lines and comment-only lines are skipped.
/^[\t ]*[^#\t ]/ { process_input_line($0) }
351
# After all input has been read: drop unreferenced rules, shorten the
# names of the remaining ones, then print the result.
END {
  omit_unused_rules()

  abbreviate_rule_names()

  output_saved_lines()
}
356}
357