# From arnold@f7.net Sun Sep 5 12:30:53 2004
# Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
# From: William J Poser <wjposer@ldc.upenn.edu>
# To: arnold@skeeve.com
# Subject: gawk bug
# Message-ID: <20040903004347.W80049@lorax.ldc.upenn.edu>
#
# Here is a revised version of my previous message, modified to describe
# the accompanying files.
#
# IhSplit.awk should replicate every record with exactly one entry in the
# IH field, delete records lacking an IH field, and produce as many copies
# of records with two or more entries in the IH field as there are entries.
# In the latter case, the original IH field should be relabelled OIH and
# a new IH field be added at the beginning of the record.
#
# This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
# and possibly later versions. Unfortunately I didn't keep track of exactly what version it
# broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
# 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
# eliminated in version 3.1.4.
#
# The problem was that an apparently random subset of records would lose some
# or all of their fields. Running the script on the same input always produces
# the same output with the same errors.
#
# The file Input is a subset of a real lexicon that produces errors using
# gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
# output. A diff will show that there are actually two errors. One record
# has fields stripped as described above. Another is omitted in its entirety.
#
#
# Bill Poser, Linguistics, University of Pennsylvania
# http://www.ling.upenn.edu/~wjposer/ billposer@alum.mit.edu
# ----------------------------------------------------------------------------
# For each record that contains multiple items in its inverse headword (IH)
# field, generate a set of new records each containing exactly one item
# in the inverse headword field, otherwise copies of the original.

# Empty the tag table "rec" so the next input record starts from scratch.
function CleanUp() {
    for (tag in rec)
        delete rec[tag]
}

# Input setup: RS = "" selects paragraph mode (records are separated by
# blank lines); a field begins at each "%", optionally preceded by a newline.
BEGIN {
    FS = "\n?%"
    RS = ""
}

{
    # Build the tag table: rec[TAG] holds everything after the first ":" of
    # the field. Field 1 is skipped because the separator at the very start
    # of the record creates an initial empty field.
    fieldno = 2
    while (fieldno <= NF) {
        split($fieldno, parts, ":")
        colon = index($fieldno, ":")
        rec[parts[1]] = substr($fieldno, colon + 1)
        fieldno++
    }

    # A record without an IH field produces no output.
    if (!("IH" in rec))
        next

    # The inverse headwords inside the IH field are separated by "/".
    count = split(rec["IH"], heads, "/")

    # Relabel the first "%IH:" in the record text as "%OIH:".
    sub(/%IH:/, "%OIH:", $0)

    # Print one copy of the relabelled record per inverse headword, each
    # preceded by a new IH line and followed by a blank line.
    # "entries" counts the copies written; nothing visible here reads it.
    for (k = 1; k <= count; k++) {
        entries += 1
        printf("%%IH:%s\n%s\n\n", heads[k], $0)
    }

    CleanUp()
}