# From Gawk Manual modified by bug fix and removal of punctuation
#
# Lists the distinct words longer than 10 characters found in the input,
# together with a "<count> long words" line, all written through the
# external command named by SORT.

# SORT may be preset by the invoker; fall back to a C-locale sort when it
# is unset or otherwise evaluates as false.
BEGIN {
    if (!SORT) {
        SORT = "LC_ALL=C sort"
    }
}

# For every field of every record, lowercase it and remember the first run
# of lowercase letters and/or hyphens it contains. Fields with no such run
# contribute nothing.
{
    for (field = 1; field <= NF; field++) {
        lowered = tolower($field)
        # match() returns 0 when nothing matches; otherwise it sets
        # RSTART and RLENGTH to the position and length of the match.
        if (!match(lowered, /([[:lower:]]|-)+/))
            continue
        used[substr(lowered, RSTART, RLENGTH)] = 1
    }
}

# Send each recorded word longer than 10 characters to the SORT command,
# followed by a summary line holding the count of such words. The summary
# goes down the same pipe as the words, so it is ordered along with them.
END {
    long_total = 0
    for (word in used) {
        if (length(word) <= 10)
            continue
        long_total++
        print word | SORT
    }
    print long_total, "long words" | SORT
    # Closing the pipe lets the sort command see end-of-input and finish.
    close(SORT)
}