| 1 | # From [email protected] Sun Sep 5 12:30:53 2004
|
|---|
| 2 | # Date: Fri, 3 Sep 2004 00:54:32 -0400 (EDT)
|
|---|
| 3 | # From: William J Poser <[email protected]>
|
|---|
| 4 | # To: [email protected]
|
|---|
| 5 | # Subject: gawk bug
|
|---|
| 6 | # Message-ID: <[email protected]>
|
|---|
| 7 | #
|
|---|
| 8 | # Here is a revised version of my previous message, modified to describe
|
|---|
| 9 | # the accompanying files.
|
|---|
| 10 | #
|
|---|
| 11 | # IhSplit.awk should replicate every record with exactly one entry in the
|
|---|
| 12 | # IH field, delete records lacking an IH field, and produce as many copies
|
|---|
| 13 | # of records with two or more entries in the IH field as there are entries.
|
|---|
| 14 | # In the latter case, the original IH field should be relabelled OIH and
|
|---|
| 15 | # a new IH field be added at the beginning of the record.
|
|---|
| 16 | #
|
|---|
| 17 | # This has worked properly for many years, since at least 1997. It worked properly with gawk 3.0.5
|
|---|
| 18 | # and possibly later versions. Unfortunately I didn't keep track of exactly what version it
|
|---|
| 19 | # broke on, but it was whatever came with Mandrake Linux 9.0. It continued to fail with version
|
|---|
| 20 | # 3.1.2. However, the problem was eliminated with version 3.1.3 and remains
|
|---|
| 21 | # eliminated in version 3.1.4.
|
|---|
| 22 | #
|
|---|
| 23 | # The problem was that an apparently random subset of records would lose some
|
|---|
| 24 | # or all of their fields. Running the script on the same input always produces
|
|---|
| 25 | # the same output with the same errors.
|
|---|
| 26 | #
|
|---|
| 27 | # The file Input is a subset of a real lexicon that produces errors using
|
|---|
| 28 | # gawk 3.1.2. GoodOutput is the expected output. BadOutput is the erroneous
|
|---|
| 29 | # output. A diff will show that there are actually two errors. One record
|
|---|
| 30 | # has fields stripped as described above. Another is omitted in its entirety.
|
|---|
| 31 | #
|
|---|
| 32 | #
|
|---|
| 33 | # Bill Poser, Linguistics, University of Pennsylvania
|
|---|
| 34 | # http://www.ling.upenn.edu/~wjposer/ [email protected]
|
|---|
| 35 | # ----------------------------------------------------------------------------
|
|---|
| 36 | #For each record that contains multiple items in its inverse headword (IH)
|
|---|
| 37 | #field, generate a set of new records each containing exactly one item
|
|---|
| 38 | #in the inverse headword field, otherwise copies of the original.
|
|---|
| 39 |
|
|---|
# CleanUp - empty the global tag table `rec` so that the next input record
# starts without any tags left over from the current one.
#
# Parameters: none.  The extra parameter `i` is never supplied by callers;
# it exists only to give the loop index function-local scope (awk's idiom
# for local variables), so a caller's own global `i` is left untouched.
# Returns: nothing meaningful.
function CleanUp(    i)
{
	for (i in rec)
		delete rec[i];
}
| 44 |
|
|---|
# One-time setup: describe the layout of the input to awk.
BEGIN {
	FS = "\n?%";	# a field starts at "%", optionally preceded by a newline
	RS = "";	# empty RS: records are separated by blank lines
}
# Main rule, run for every input record.
#
# Records without an IH tag are dropped.  For every other record the
# existing IH field is relabelled OIH, and one copy of the record is
# printed per "/"-separated item in the IH value, each copy preceded by a
# fresh "%IH:<item>" line and followed by a blank line.
#
# Globals: `rec` maps tag -> value for the current record and is emptied by
# CleanUp(); `entries` counts the copies printed so far (it is only
# incremented here, never read in this rule).
{
	# Index each field by its tag, i.e. the text before the first ":".
	# Field 1 is skipped: the leading "%" of a record matches FS and
	# therefore produces an empty first field.
	for (n = 2; n <= NF; n++) {
		field = $n;
		split(field, f, ":");
		rec[f[1]] = substr(field, index(field, ":") + 1);
	}

	# No inverse headword: emit nothing.  The tags stay in `rec` on this
	# path, which is harmless because "IH" is the only entry read back
	# and a skipped record, by definition, did not set it.
	if (!("IH" in rec))
		next;

	# Relabel the original IH field in the record text ($0 is the
	# default target of sub()).
	sub(/%IH:/, "%OIH:");

	# Break the saved IH value into its individual inverse headwords.
	items = split(rec["IH"], ihs, "/");

	# Print one copy of the relabelled record per inverse headword.
	k = 1;
	while (k <= items) {
		entries++;
		printf("%%IH:%s\n", ihs[k]);
		printf("%s\n\n", $0);
		k++;
	}

	CleanUp();
}