#!/bin/sh
# Read LaTeX source on stdin and print its distinct words on stdout, one
# per line, lower-cased and sorted.
#
# Steps, in order:
#   1. sed:  turn escaped percent signs (\%) into a space, drop % comments,
#            delete \begin{..}, \end{..}, \ref{..}, \label{..} and
#            \index{..}, blank out \namedlabel{..}{..}, the escapes
#            \_ \$ \# \\ and any remaining \command name.
#   2. tr:   turn punctuation into spaces, then fold upper case to lower.
#   3. tr:   put every blank-separated token on its own line.
#   4. grep: drop empty lines and tokens starting with '-'.
extract_words() {
    # Every substitution is line-local, so a single sed with several -e
    # expressions is equivalent to a chain of one-command seds.
    sed -e 's/\\%/ /g' \
        -e 's/%.*//' \
        -e 's/\\begin{[^}]*}//g' \
        -e 's/\\end{[^}]*}//g' \
        -e 's/\\ref{[^}]*}//g' \
        -e 's/\\namedlabel{[^}]*}{[^}]*}/ /g' \
        -e 's/\\label{[^}]*}//g' \
        -e 's/\\index{[^}]*}//g' \
        -e 's/\\[_$#\]/ /g' \
        -e 's/\\[a-z]*/ /g' |
        tr '{}()[]<>|\\,.:;@^%/~?!&$=*_#"`'"'-" " " |
        tr '[A-Z]' '[a-z]' |
        # A sed "s" replacement may not contain a bare newline, so split
        # on blanks and tabs with tr instead.
        tr ' \t' '\n\n' |
        grep -v '^$' |
        grep -v '^-' |
        sort |
        uniq
}

# Collect the words of every file named on the command line into
# /tmp/words.all; progress (the file names) goes to stderr.
printf '%s' "Extracting all words from " >&2
for f in "$@"; do
    printf '%s ' "$f" >&2
    extract_words < "$f"
done | sort | uniq > /tmp/words.all
echo >&2

# delta FILE1 FILE2
# Print how many more lines FILE1 has than FILE2 (negative when FILE2 is
# the longer one).  Used below to report how many words a pass removed.
#
# Written with the POSIX "name()" form because the script runs under
# /bin/sh, where the "function" keyword is not guaranteed to exist.
# Shell arithmetic replaces expr so that a difference of 0 is not
# reported as a failing exit status, and no helper variables leak into
# the caller's scope.
delta() {
    echo $(( $(wc -l < "$1") - $(wc -l < "$2") ))
}

# Snapshot of the unfiltered word list; nothing below reads it back.
cp /tmp/words.all /tmp/words.0
echo "Removing derivative words" >&2
    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...numbers: "
    # Drop tokens made only of digits and dashes, and 0x-prefixed hex
    # literals; a line is kept only if it matches neither pattern.
    grep -v -e '^[-0-9]*$' -e '^0x[0-9a-f]*$' /tmp/words.all > "/tmp/w1.$$"
    # Report how many words this pass removed, then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...ing: "
    # Generate the "-ing" forms each remaining word could give rise to:
    #   word + ing            (walk -> walking)
    #   doubled last letter   (run  -> running)
    #   ie -> ying            (die  -> dying)
    #   e  -> ing             (make -> making)
    # "h" saves the word and "g" restores it before each rule; with -n only
    # the forms produced by a successful substitution are printed.
    sed -n \
        -e 'h' \
        -e 's/$/ing/p' \
        -e 'g' \
        -e 's/.$/&&ing/p' \
        -e 'g' \
        -e 's/ie$/ying/p' \
        -e 'g' \
        -e 's/e$/ing/p' \
        /tmp/words.all | sort > "/tmp/w0.$$"
    # Keep only the words that are not one of the generated forms.
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    # Report how many words this pass removed, then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...s: "
    # Generate the plural-style forms each remaining word could give rise to:
    #   word + s     (word -> words)
    #   word + es    (box  -> boxes)
    #   y -> ies     (city -> cities)
    #   s -> ses     (bus  -> buses)
    # NOTE(review): for a word ending in "s" the last rule yields the same
    # form as "word + es"; it may have been meant as "sses" -- confirm.
    # "h" saves the word and "g" restores it before each rule; with -n only
    # the forms produced by a successful substitution are printed.
    sed -n \
        -e 'h' \
        -e 's/$/s/p' \
        -e 'g' \
        -e 's/$/es/p' \
        -e 'g' \
        -e 's/y$/ies/p' \
        -e 'g' \
        -e 's/s$/ses/p' \
        /tmp/words.all | sort > "/tmp/w0.$$"
    # Keep only the words that are not one of the generated forms.
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    # Report how many words this pass removed (delta prints the count
    # itself, so no echo wrapper is needed), then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...d: "
    # Generate the past-tense forms each remaining word could give rise to:
    #   word + d              (use   -> used)
    #   word + ed             (walk  -> walked)
    #   doubled last letter   (stop  -> stopped)
    #   y -> ied              (carry -> carried)
    # "h" saves the word and "g" restores it before each rule; with -n only
    # the forms produced by a successful substitution are printed.
    sed -n \
        -e 'h' \
        -e 's/$/d/p' \
        -e 'g' \
        -e 's/$/ed/p' \
        -e 'g' \
        -e 's/.$/&&ed/p' \
        -e 'g' \
        -e 's/y$/ied/p' \
        /tmp/words.all | sort > "/tmp/w0.$$"
    # Keep only the words that are not one of the generated forms.
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    # Report how many words this pass removed (delta prints the count
    # itself, so no echo wrapper is needed), then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...ly: "
    # Generate the adverb forms each remaining word could give rise to:
    #   y -> ily      (happy    -> happily)
    #   word + ly     (quick    -> quickly)
    #   ble -> bly    (possible -> possibly)
    # The last rule is anchored with "$" like the others: unanchored it
    # rewrote the first "ble" anywhere in the word (problem -> problym) and
    # missed the real suffix whenever "ble" also occurred earlier.
    # "h" saves the word and "g" restores it before each rule; with -n only
    # the forms produced by a successful substitution are printed.
    sed -n \
        -e 'h' \
        -e 's/y$/ily/p' \
        -e 'g' \
        -e 's/$/ly/p' \
        -e 'g' \
        -e 's/ble$/bly/p' \
        /tmp/words.all | sort > "/tmp/w0.$$"
    # Keep only the words that are not one of the generated forms.
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    # Report how many words this pass removed, then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # printf instead of "echo -n": the -n flag is not portable under /bin/sh.
    printf '%s' "...ion: "
    # Generate the "-ion" forms each remaining word could give rise to:
    #   word + ion     (act    -> action)
    #   te -> tion     (create -> creation)
    #   de -> sion     (decide -> decision)
    #   te -> tions    (create -> creations)
    #   de -> sions    (decide -> decisions)
    # "h" saves the word and "g" restores it before each rule; with -n only
    # the forms produced by a successful substitution are printed.
    sed -n \
        -e 'h' \
        -e 's/$/ion/p' \
        -e 'g' \
        -e 's/te$/tion/p' \
        -e 'g' \
        -e 's/de$/sion/p' \
        -e 'g' \
        -e 's/te$/tions/p' \
        -e 'g' \
        -e 's/de$/sions/p' \
        /tmp/words.all | sort > "/tmp/w0.$$"
    # Keep only the words that are not one of the generated forms.
    comm -23 /tmp/words.all "/tmp/w0.$$" > "/tmp/w1.$$"
    # Report how many words this pass removed, then install the result.
    delta /tmp/words.all "/tmp/w1.$$"
    mv "/tmp/w1.$$" /tmp/words.all

    # Remove the per-process scratch files and move the filtered word list
    # into the current directory.
    rm -f "/tmp/w0.$$" "/tmp/w1.$$"
    mv /tmp/words.all words.all

    echo "extracting probable misspellings"
    # Prefer the traditional dictionary path; fall back to the location
    # used by most current systems when it is missing.
    dict=/usr/dict/words
    [ -r "$dict" ] || dict=/usr/share/dict/words
    if [ -r "$dict" ]; then
        # comm requires both inputs to be sorted in the same collating
        # order, so re-sort the dictionary here rather than trusting its
        # on-disk order.  words.bad receives the words absent from it.
        sort "$dict" | comm -23 words.all - > words.bad
    else
        echo "no word list found (tried /usr/dict/words and /usr/share/dict/words)" >&2
        exit 1
    fi
