#! /bin/sh
# Sort, sanity-check, and generally groom the GPL'ed TLA FAQ (GTF) or a
# similarly formed list of TLAs.
#
# Usage: this-script FILE
#
# FILE holds one entry per line: a three-letter acronym, whitespace, and its
# meaning.  When every check passes, FILE is replaced by a normalized, sorted,
# de-duplicated copy, and a copy of the original is kept in a
# /tmp/gtfbackup.XXXXXX file.
#
# Exit status:
#   0  FILE was groomed and replaced
#   1  some meaning does not appear to spell out its TLA
#   2  FILE is missing, unreadable or empty, or the result has malformed entries
#   3  the reformatted list is empty, or a temporary file, the backup, or the
#      replacement could not be written
#
# Uses GNU-specific features: "\t" in sed replacements, "\+" in grep patterns,
# and mv --update/--backup.

# Byte-wise collation and character classes, so results do not vary by locale.
export LC_ALL=C

# Scratch file holding the canonicized list; empty until mktemp has succeeded.
tempnm=

# Remove the scratch file.  Installed as the EXIT trap so that every exit path
# cleans up, not only the ones that remember to.
cleanup() {
  if test -n "$tempnm" ; then
    rm -f -- "$tempnm"
  fi
}
trap cleanup EXIT
# Turn fatal signals into a normal exit so that the EXIT trap still runs.
trap 'exit 1' HUP INT TERM

# Preparation phase. A temporary file is created, and initial entries are
# counted.
date

if test -z "$1" ; then
  echo "Usage: $0 FILE"
  exit 2
fi
if ! test -r "$1" ; then
  echo "Cannot read $1"
  exit 2
fi

if ! tempnm=$(mktemp /tmp/gtftemp.XXXXXX) || test -z "$tempnm" ; then
  echo "Could not create temporary file"
  exit 3
fi

printf '%s' "Counting entries in $1: "
ENTRIES=$(grep -c -v '^$' -- "$1")
echo "$ENTRIES"
if ! test "$ENTRIES" -gt 0 ; then
  echo "No entries found!"
  exit 2
fi

# Transformation phase. A canonicized version is written to the temporary file.
echo

# The three sed expressions, in order:
#   1. strip leading and trailing whitespace and put exactly one tab between
#      the TLA and its meaning;
#   2. collapse every run of two or more whitespace characters to one space;
#   3. turn a tab that follows at least five characters (i.e. one that is not
#      the field separator) into a space.
# Blank lines are then dropped and the result is sorted case-insensitively
# with duplicates removed.  Only the status of the final sort is tested.
echo "Writing reformatted version to $tempnm"
if ! sed \
    -e 's/^[[:space:]]*\([A-Z]\{3\}\)[[:space:]]\{1,\}\(.*[^[:space:]]\)[[:space:]]*$/\1\t\2/' \
    -e 's/[[:space:]]\{2,\}/ /g' \
    -e 's/^\(.\{5,\}\)\t/\1 /g' \
    <"$1" \
  | grep -v '^$' \
  | sort -u -f >"$tempnm" ; then
  exit 3
fi

printf '%s' "Counting entries in result: "
ENTRIES=$(wc -l <"$tempnm")
echo "$ENTRIES"
if ! test "$ENTRIES" -gt 0 ; then
  echo "No entries found!"
  exit 3
fi

# Verification phase. The transformed list is checked for correct form.
echo

# Well-formed entries consist of exactly:
# - Three upper-case letters
# - A tab
# - A sequence of alphanumerics, punctuation, spaces (but no other whitespace)
# - A final letter, punctuation, or digit
# - End-of-line
# grep -v prints the offenders and succeeds only if there are any.
echo "Looking for malformed entries:"
if grep -h -v '^[[:upper:]]\{3\}[[:space:]][ [:graph:]]\+[[:graph:]]$' -- "$tempnm" ; then
  exit 2
fi

# In addition, a well-formed definition must consist of multiple words, i.e.
# contain a hyphen, space, slash, ampersand or plus sign.
if grep -h -v '^.\{5\}.*[- /&+]' -- "$tempnm" ; then
  exit 2
fi

# Near-duplicates are entries that become identical once spaces, quotes,
# hyphens, bracketed remarks and the punctuation ",;+&" are removed, slashes
# and the separating tab are turned into single spaces, "our" is rewritten to
# "or" and "IZA"/"IZE" to "ISA"/"ISE" (presumably to equate British and
# American spellings), and case is folded.  This check only reports; it never
# aborts the run.
#
# NOTE(review): the source this was restored from carried two more
# expressions, 's/\//g' (after the slash rule in the first sed) and
# 's/\/AND/g' (after the IZ rule in the second sed).  Both are unterminated
# "s" commands that make sed abort, so they are left out here.  They look like
# word-boundary patterns whose contents were lost; restore them from a
# pristine copy if one is available.
echo "Looking for near-duplicates:"
tr -d " '\"-" <"$tempnm" \
  | sed \
      -e 's/\[[^]]*\]//g' \
      -e 's/[,;+&]//g' \
      -e 's/\// /g' \
      -e 's/[[:space:]][[:space:]]*/ /g' \
      -e 's/\([^[:space:]A-Zfhjsy]\)our\([^t]\)/\1or\2/g' \
  | tr '[:lower:]' '[:upper:]' \
  | sed -e 's/\([^[:space:]]\)IZ\([AE]\)/\1IS\2/g' \
  | sort \
  | uniq -i -c -d

# An entry is suspicious when the three letters of its TLA do not occur, in
# order, in its meaning.  Entries containing "!" are exempt, and bracketed
# remarks are ignored.  Any output from the pipeline makes the run fail.
echo "Looking for possibly incorrect TLAs:"
if grep -v '[!]' -- "$tempnm" \
  | sed -e 's/\[[^]]*\]//g' \
  | perl -ne '($p1,$p2,$p3,$rest) = /^(.)(.)(.)\w*(.*)/; $p="$p1.*$p2.*$p3"; print if not $rest=~/$p/' \
  | grep -v '^$' ; then
  exit 1
fi

# Statistics phase. No files are modified.
echo

# The list is sorted, so equal TLAs are adjacent and plain uniq can count them.
printf '%s' "Counting distinct TLAs (maximum possible is 17576):"
DISTINCT=$(cut -c1-3 -- "$tempnm" | uniq | wc -l)
# Integer percentage, truncated.
DISTINCTPCT=$((DISTINCT * 100 / 17576))
echo " $DISTINCT ( $DISTINCTPCT% )"

TOPLIST=5
echo "Top $TOPLIST TLAs:"
cut -c1-3 -- "$tempnm" | uniq -c | sort -n -r -b | head -n "$TOPLIST"

# For each number of meanings, count how many TLAs have that many.
echo "Histogram:"
echo "TLAs occurrences"
cut -c1-3 -- "$tempnm" \
  | uniq -c \
  | sed -e 's/^[[:space:]]*\([0-9]*\)[[:space:]]*/\1\t/' \
  | sort -n -r \
  | cut -f1 \
  | uniq -c

# Writeback phase. Original is backed up & replaced by the canonicized list.
echo

if ! backup=$(mktemp /tmp/gtfbackup.XXXXXX) || test -z "$backup" ; then
  echo "Could not create backup file!"
  exit 3
fi
echo "Copying backup of $1 to $backup"
if ! cp -- "$1" "$backup" ; then
  echo "Failed to back up the original file; not overwriting it."
  # Do not leave the empty placeholder created by mktemp behind.
  rm -f -- "$backup"
  exit 3
fi

# NOTE(review): the replacement is the mktemp-created file itself, so FILE
# presumably ends up with that file's ownership and permissions - confirm
# that this is acceptable.
# With --update, mv leaves both files alone when the destination is newer; a
# scratch file that still exists afterwards therefore means "not replaced".
echo "Replacing $1 with $tempnm"
if ! mv --update --backup -- "$tempnm" "$1" || test -e "$tempnm" ; then
  echo "Could not replace $1; a copy of the original is in $backup"
  exit 3
fi

# Signoff phase. Summarize statistics into a "progress report."
echo
echo "$ENTRIES / $DISTINCT ( $DISTINCTPCT% )"