#! /bin/bash

set -eu -o pipefail

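# Sort, sanity-check, and generally groom the GPL'ed TLA FAQ (GTF) or a
# similarly formatted list of TLAs.  The list file is given as the first
# argument and is rewritten in place once all checks have passed.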

# Preparation phase.  The argument is checked, a temporary file is created,
# and initial entries are counted.
export LC_ALL=C

# Fail early with a usage hint; otherwise "set -u" aborts on the first "$1"
# with an "unbound variable" error after the temporary file already exists.
if test "$#" -lt 1 ; then
  echo "Usage: $0 FILE" >&2
  exit 2
fi

date

# Under "set -e" a failing mktemp already aborts the script; the emptiness
# test is kept as a second line of defence.
tempnm=$(mktemp /tmp/gtftemp.XXXXXX)
if test -z "$tempnm" ; then
  echo "Could not create temporary file"
  exit 3
fi
# Remove the temporary file on every exit path, including "set -e" aborts.
# After a successful writeback the file has been moved away and "rm -f" is a
# silent no-op.
trap 'rm -f -- "$tempnm"' EXIT

echo -n "Counting entries in $1: "
# Count non-empty lines.  grep exits with status 1 when nothing matched, which
# is a legitimate "zero entries" answer here; only statuses above 1 are real
# errors (e.g. unreadable file).  Capturing the status keeps "set -e" from
# aborting before the "No entries found!" diagnostic can be printed.
grep_status=0
ENTRIES=$(grep -c -v '^$' -- "$1") || grep_status=$?
if test "$grep_status" -gt 1 ; then
  # Terminate the unfinished "Counting entries" line; grep has already
  # reported the cause on stderr.
  echo
  exit "$grep_status"
fi
echo "$ENTRIES"
if ! test "$ENTRIES" -gt 0 ; then
  echo "No entries found!"
  exit 2
fi

# Transformation phase.  A canonicalized version is written to the temporary
# file.
echo

# Remove leading and trailing whitespace, ensure that TLA and meaning are
# separated by exactly one tab, replace sequences of whitespace by a single
# space, and replace any tabs except the field separator by normal space.
# Once whitespace is normalized, sort the result & eliminate duplicates.
#
# The stray-tab substitution is anchored at the start of the line and matches
# greedily, so a single application only converts the LAST tab that sits at
# least five characters into the line.  The ":tab" / "t tab" loop repeats it
# until no such tab is left, so every tab after the field separator becomes a
# space.
#
# Under "set -o pipefail" the pipeline also fails when grep finds no non-empty
# line at all, i.e. when the result would be empty.
echo "Writing reformatted version to $tempnm"
if ! sed \
    -e 's/^[[:space:]]*\([A-Z]\{3\}\)[[:space:]]\{1,\}\(.*[^[:space:]]\)[[:space:]]*$/\1\t\2/' \
    -e 's/[[:space:]]\{2,\}/ /g' \
    -e ':tab' \
    -e 's/^\(.\{5,\}\)\t/\1 /' \
    -e 't tab' \
    <"$1" \
  | grep -v '^$' \
  | sort -u -f >"$tempnm" ; then
  echo "Could not write reformatted version to $tempnm" >&2
  rm -f -- "$tempnm"
  exit 3
fi

echo -n "Counting entries in result: "
ENTRIES=$(wc -l <"$tempnm")
echo "$ENTRIES"
if ! test "$ENTRIES" -gt 0 ; then
  echo "No entries found!"
  # Do not leave the (empty) temporary file behind.
  rm -f -- "$tempnm"
  exit 3
fi

# Verification phase.  The transformed list is checked for correct form.
echo

# Every line of the temporary file has to match both patterns below; lines
# that fail one are printed, the temporary file is removed, and the script
# stops with status 2.
#
# Pattern 1, the overall shape of an entry:
#  - three upper-case letters
#  - one whitespace character (the transformation phase writes a tab here)
#  - one or more printable characters or plain spaces
#  - a final printable, non-space character
#  - end of line
#
# Pattern 2, the definition must consist of multiple words: somewhere after
# the first five characters there has to be one of "-", space, "/", "&", "+".
echo "Looking for malformed entries:"
for wellformed in \
    '^[[:upper:]]\{3\}[[:space:]][ [:graph:]]\+[[:graph:]]$' \
    '^.\{5\}.*[- /&+]' ; do
  # grep -v succeeds exactly when at least one line does NOT match.
  if grep -h -v "$wellformed" -- "$tempnm" ; then
    rm -f -- "$tempnm"
    exit 2
  fi
done

echo "Looking for near-duplicates:"
# Reduce every entry to a rough normal form and report the forms that occur
# more than once (with their count).  This is informational only; it never
# stops the script.  Stages, in order:
#  1. drop spaces, quotes and hyphens
#  2. drop [bracketed] text and the characters , ; + &, turn "/" into a space,
#     drop the word "and", squeeze whitespace runs (the field separator
#     included) to one space, and rewrite "our" as "or" unless it follows
#     whitespace, an upper-case letter or one of f h j s y, or precedes "t"
#  3. convert to upper case
#  4. rewrite IZA/IZE as ISA/ISE and a leading INCORPORATED as INC.
#  5. sort and print only the repeated lines, prefixed by their count
tr -d " '\"-" <"$tempnm" \
  | sed \
      -e 's/\[[^]]*\]//g' \
      -e 's/[,;+&]//g' \
      -e 's/\// /g' \
      -e 's/\<and\>//g' \
      -e 's/[[:space:]][[:space:]]*/ /g' \
      -e 's/\([^[:space:]A-Zfhjsy]\)our\([^t]\)/\1or\2/g' \
  | tr '[a-z]' '[A-Z]' \
  | sed \
      -e 's/\([^[:space:]]\)IZ\([AE]\)/\1IS\2/g' \
      -e 's/\<INCORPORATED/INC./g' \
      -e 's/\<&\>/AND/g' \
  | sort \
  | uniq -i -c -d

echo "Looking for possibly incorrect TLAs:"
# An entry is suspicious when the three letters of its TLA do not occur, in
# that order and in the same case, in the rest of the line.  Entries
# containing "!" are exempt from the check, and [bracketed] text is ignored.
# Suspicious entries are printed; the final grep turns "anything printed" into
# the exit status tested by the if.
if grep -v '[!]' -- "$tempnm" \
    | sed -e 's/\[[^]]*\]//g' \
    | perl -ne '($p1,$p2,$p3,$rest) = /^(.)(.)(.)\w*(.*)/; $p="$p1.*$p2.*$p3"; print if not $rest=~/$p/' \
    | grep -v '^$' ; then
  # Clean up like the malformed-entry checks do before they bail out.
  rm -f -- "$tempnm"
  exit 1
fi


# Statistics phase.  No files are modified.
echo

echo -n "Counting distinct TLAs (maximum possible is 17576):"
# The temporary file is sorted, so equal TLAs are adjacent and a plain "uniq"
# on the first three characters leaves one line per distinct TLA.
DISTINCT=$(cut -c1,2,3 -- "$tempnm" | uniq | wc -l)
# Whole-number percentage of the 17576 (26^3) possible TLAs.  Shell arithmetic
# truncates the same way bc does at its default scale of 0, without needing
# the external bc program.
DISTINCTPCT=$(( DISTINCT * 100 / 17576 ))
echo " $DISTINCT ( $DISTINCTPCT% )"

export TOPLIST=5
echo "Top $TOPLIST TLAs:"
# Count the entries per TLA and show the $TOPLIST most frequent ones.
#
# "head" exits as soon as it has printed its lines.  If "sort" is still
# writing at that moment it is killed by SIGPIPE, and with "set -o pipefail"
# that makes the whole pipeline fail, depending on timing and output size.
# awk reads its input to the end, so no stage is cut short and no blanket
# "|| true" is needed to hide the status.
cut -c1,2,3 -- "$tempnm" \
  | uniq -c \
  | sort -n -r -b \
  | awk -v limit="$TOPLIST" 'NR <= limit'

echo "Histogram:"
echo "TLAs	occurrences"
# For every occurrence count, print how many TLAs have exactly that many
# entries, highest count first:
#  - count the entries per TLA (first three characters of each line)
#  - rewrite each "uniq -c" line as "<count><tab><TLA>"
#  - sort by count, descending, and keep only the count column
#  - collapse equal counts, prefixing each with the number of TLAs
cut -c1-3 -- "$tempnm" \
  | uniq -c \
  | sed -e 's/^[[:space:]]*\([0-9]*\)[[:space:]]*/\1\t/' \
  | sort -n -r \
  | cut -f1 \
  | uniq -c


# Writeback phase.  Original is backed up & replaced by the canonicalized list.
echo

# Under "set -e" a failing mktemp already aborts the script; the emptiness
# test is kept as a second line of defence.
backup=$(mktemp /tmp/gtfbackup.XXXXXX)
if test -z "$backup" ; then
  echo "Could not create backup file!"
  rm -f -- "$tempnm"
  exit 3
fi

echo "Copying backup of $1 to $backup"
if ! cp -- "$1" "$backup" ; then
  echo "Failed to back up the original file; not overwriting it."
  # A failed copy leaves at best a partial backup; remove it together with the
  # temporary file so nothing misleading stays behind in /tmp.
  rm -f -- "$backup" "$tempnm"
  exit 3
fi

echo "Replacing $1 with $tempnm"
# mktemp creates its file accessible to the owner only.  Give the temporary
# file the original's permission bits first, so that moving it into place does
# not silently narrow the permissions of the list.
chmod --reference="$1" -- "$tempnm"
# --update: only replace the original if the temporary file is newer.
# --backup: keep the previous version next to the original as well.
mv --update --backup -- "$tempnm" "$1"
# When --update decides not to replace the original, mv still succeeds and the
# temporary file stays where it was; say so instead of finishing silently.
if test -e "$tempnm" ; then
  echo "Not replacing $1: it is not older than the reformatted version." >&2
  rm -f -- "$tempnm"
fi

# Signoff phase.  Print a one-line "progress report" built from the values
# computed earlier: entries in the result / distinct TLAs ( percentage ).
echo

printf '%s / %s ( %s%% )\n' "$ENTRIES" "$DISTINCT" "$DISTINCTPCT"