/*
 *  same.c  V1.6  -- Works for me.
 *
 *  Copyright (C) by R.E.Wolff -- R.E.Wolff@BitWizard.nl
 *
 *  I prefer if you try to contact me if you have enhancements,
 *  instead of forking off a different branch.....
 *
 *              date          by     what
 *  Written:    ??? ?? 1996   REW    initial revision
 *  changed:    Jun 15 1997   REW    enhancements & documentation
 *                                   for publication.
 *              Oct 18 1997   Geert  sped up relinking of files
 *              Dec 28 1998   REW    Added restart feature.
 *              Oct 8  2000   Geert  Speedup improvements:
 *                                     memory pools etc.
 *                                   should work now on "large" filesets.
 *		Sep 09 2002   ri     added strict compiler options
 *                                   quiet signal handler
 *                                   --help and --version options
 *                                   improved parameter scanning
 *              Jan 8 2004    Geert  Add -u option
 *                                   Kill superfluous prototype
 *                                   Kill superfluous whitespace
 *                                   Add comment about safeness in case of
 *                                     inappropriate file permissions
 *                                   Remove comment about RCS
 *                                   Make hash_start static
 *                                   Remove unused routine get_crc_table()
 *                                   Add saved disk space to statistics
 *              Jun 22 2004   Geert  Add caching option
 *                                   Use zlib for crc calculations
 *                                   Many small clean ups
 *              Jul 14 2004  REW     Changed UID option to do what I want.
 *                                   Added "write cache on interrupt" feature.
 *
 *  who-is-who:
 *    initials full name                 Email address
 *    REW      Roger E. Wolff            R.E.Wolff@BitWizard.nl
 *    Geert    Geert Uytterhoeven        geert@linux-m68k.org
 *    ri       Roland Illig              roland.illig@gmx.de
 *
 *
 * Introduction.
 * ------------
 * This program takes a list of files (e.g. the output of find . -type f)
 * on stdin. Each of the files is compared against each of the others.
 * Whenever two files are found that match exactly, the two files are
 * linked (soft or hard) together.
 *
 * Goal.
 * ----
 * The goal of this program is to conserve disk space when you have
 * several different trees of large projects on your disk. By creating
 * hardlinks or softlinks between the files that are the same, you can
 * save lots of disk space. For example, two different versions of the
 * Linux kernel only differ in a small number of files. By running this
 * program you only need to store the contents of those files once.
 * This is especially useful if you have different versions of complete
 * trees lying around.
 *
 * Implementation.
 * --------------
 * The filesize of every file is used as an indication of whether two
 * files can be the same. Whenever the filesizes match, the hashes of
 * these two files are compared. Whenever these match, the file
 * contents are compared. For every matching pair one of the two
 * files is replaced by a hard link to the other file.
 * With the -s option a softlink is used.
 *
 * To allow you to do this incrementally, the "rm" is done on the file
 * with the least links. This allows you to "merge" a new tree with
 * several trees that have already been processed. The new tree has
 * link count 1, while the old tree has a higher link count for those
 * files that are likely candidates for linkage.
 *
 * The current implementation keeps the "first" incarnation of a file,
 * and replaces further occurrences of the same file. This is
 * significant when using softlinks.
 *
 *
 * Example.
 * -------
 * For example you could do:
 *
 *      find . -type f | same
 *
 * This links all files together under the current directory that are
 * the same.
 *
 *
 * Bugs.
 * ----
 * - Make sure that you have all the permissions required for
 *   execution of the commands. If you don't, no bad things should happen,
 *   but the statistics will be wrong for sure.
 *
 * - If your editor does not move the original aside before writing a
 *   new copy, you will change the file in ALL incarnations when
 *   editing a file. Patch works just fine: it moves the original
 *   aside before creating a new copy. I'm confident that I could
 *   teach Emacs to do it this way too. I'm too lazy to figure it out,
 *   so if you happen to know an easy way how to do this, please Email
 *   me at R.E.Wolff@BitWizard.nl
 * 
 * - There is a BUFSIZE (currently 16384) character limit on the symlink
 *   target that is built for each file when using symlinks.
 *
 * - The same source is not exactly 32kbytes long.  However this comment 
 *   seems to fix that.
 * */


#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <signal.h>
#include <time.h>
#include <sys/times.h>
#include <zlib.h>
#include <limits.h>

#define __USE_LARGEFILE64
#include <sys/stat.h>

#if 1
/* Why the *&^#$ doesn't sys/stat define this??? */
/*
 * Hand-written prototype for lstat64(), which is called further down with
 * a struct stat64.  __const, __restrict and __THROW are not defined in
 * this file.
 * NOTE(review): presumably they come from the C library's internal headers
 * (glibc <sys/cdefs.h>) -- confirm before building against another libc.
 */
extern int lstat64 (__const char *__restrict __file,
                    struct stat64 *__restrict __buf) __THROW;
#endif

/*
 * PAGE_SIZE is only used to size the memory pools (see POOL_NUM_DATA).
 * On Linux it is taken from <asm/page.h>; elsewhere 4096 is assumed.
 */
#ifdef __linux__
#include <asm/page.h>
#else /* !__linux__ */
#define PAGE_SIZE	(4096)
#endif

/*
 * NOTE(review): the file header says V1.6 while this string says 1.8 --
 * confirm which one is current.
 */
#define VERSION		"same 1.8"

/* Largest prime smaller than 64k. */
/* Number of hash buckets; files are bucketed by size modulo MAXHASH. */
#define MAXHASH		65521

/* Suffixes appended to the cache file name while saving the cache. */
#define BACKUP_SUFFIX	".old"
#define TMPNAME_SUFFIX	".new"

/* Size of the static I/O, line and path buffers used in this file. */
#define BUFSIZE		16384

/* This CRC should be "seen in the wild" about 1 in 4 billion. If
   that happens, the file's CRC will be recalculated even though 
   it's been saved in the cache. Tough luck. -- REW */
#define INVALID_CRC	0xffffffff

/* Home-grown boolean constants (passed to alloc_inode_entry()). */
#define true		1
#define false		0


/*
 * stop: polled by every long-running loop in this file; a non-zero value
 * makes them terminate early.  save_cache() resets it to 0 before writing.
 * NOTE(review): presumably set by sighandler() -- confirm.
 * doing_input: not referenced in the part of the file reviewed here.
 */
static volatile int stop;
static volatile int doing_input;


    /*
     *  Program Options
     */

static int o_debug;		/* > 0: print link operations, > 2: print deleted names */
static int o_verbose;		/* print messages and the progress bar */
static int o_softlinks;		/* create symlinks instead of hard links */
static int o_dryrun;		/* do not unlink/link/symlink, do not save the cache */
static int o_timings;		/* print the elapsed time of each phase */
static int o_null;		/* input names are NUL-terminated instead of one per line */
static int o_nullfiles;		/* also link files of size zero */
static int o_user;		/* only link files that have the same owner (uid) */
static int o_merge_cache;	/* also link cached entries among each other */
static const char *o_cache;	/* cache file name, NULL if no cache is used */


    /*
     *  Database Entries
     */

/*
 * One path name of a file.  All names that refer to the same inode are
 * chained through `next`.  The name itself is stored inline behind the
 * structure: allocate sizeof(struct name_entry) + strlen(name) + 1 bytes.
 */
struct name_entry {
    struct name_entry *next;	/* next name of the same inode, or NULL */
    char name[];		/* NUL-terminated path (C99 flexible array member) */
};

/* Bits for inode_entry.flags */
#define F_CRC_VALID		(1 << 0)	/* crc holds the CRC of the file contents */
#define F_STAT_VALID		(1 << 1)	/* device, inode, nlink and uid are filled in */
#define F_NEW			(1 << 2)	/* entry was read from stdin, not from the cache */

/*
 * One file (inode) together with all the path names known for it.
 * Entries are kept in singly linked lists hanging off the hash table.
 */
struct inode_entry {
    struct inode_entry *next;	/* next entry in the same hash bucket list */
    struct name_entry *names;	/* list of path names for this inode */
    int flags;			/* See F_* definitions above */
    /* The two fields below may have been read from the cache */
    loff_t size;
    unsigned int crc;		/* valid if flags & F_CRC_VALID only */
    /* The four fields below are valid if flags & F_STAT_VALID only */
    dev_t device;
    ino_t inode;
    nlink_t nlink;
    uid_t uid;
};

/*
 * The file database: one bucket per hash value (file size modulo MAXHASH),
 * each holding a list of cached entries and a list of newly read entries.
 */
static struct {
    struct inode_entry *old;	/* Read from cache */
    struct inode_entry *new;	/* New entries */
} hashtable[MAXHASH];

/*
 * First bucket visited by the loops over the hash table.
 * NOTE(review): presumably set from the command line for the "restart"
 * feature mentioned in the file header -- confirm.
 */
static unsigned int hash_start;

/* static uid_t uid; */


    /*
     *  Pooled Memory Allocation
     */

/*
 * Number of unsigned longs in the data area of a pool: a page worth of
 * unsigned longs minus two.
 * NOTE(review): the two presumably account for the `next` and `nfree`
 * header fields so that a struct pool fills one page -- this only holds
 * when a pointer is as large as an unsigned long; confirm.
 */
#define POOL_NUM_DATA	(((PAGE_SIZE)/sizeof(unsigned long))-2)

/*
 * A chunk of memory from which p_malloc() hands out pieces.  Pieces are
 * never returned individually.
 */
struct pool {
    struct pool *next;		/* next pool of the same hash bucket */
    unsigned long nfree;	/* free unsigned longs left at the end of data[] */
    unsigned long data[POOL_NUM_DATA];
};

/* One list of pools per hash bucket. */
static struct pool *pooltable[MAXHASH];


    /*
     *  Timing and Statistics
     */

/*
 * start_time holds the times() value taken by do_start(); total_time is
 * the sum of all intervals measured by do_stop().  Both are clock ticks
 * although they are declared as time_t.
 */
static time_t start_time, total_time;

static unsigned long stat_names_in;	/* names read from the cache */
static unsigned long stat_names_out;	/* names written to the cache */
static unsigned long stat_files_in;	/* inode entries read from the cache */
static unsigned long stat_files_out;	/* inode entries written to the cache */
static unsigned long stat_new;		/* regular files read from stdin */
static unsigned long stat_merge;	/* already existing hard links merged */
static unsigned long stat_link_inode;	/* pairs of identical files linked */
static unsigned long stat_link_name;	/* names processed while linking */
static unsigned long stat_skip_inode;	/* files skipped because of differing owner */
static unsigned long stat_skip_name;	/* link counts of the skipped files */
static unsigned long stat_stat;		/* lstat64() calls */
static unsigned long stat_crc;		/* CRC calculations */
static unsigned long stat_cmp;		/* file content comparisons */
static unsigned long long stat_saved;	/* sum of the sizes of the linked files */


    /*
     *  Function Prototypes
     */

/*
 * Forward declarations for all file-local functions, listed in the order
 * in which they are defined below.
 */
static void do_start(const char *msg);
static void do_stop(void);
static void do_end(void);
static void progress(int percent);
static void dump_inode_entry(const struct inode_entry *entry,
			     const char *indent);
static void dump_hashtable(void);
static void load_cache(void);
static void save_cache(void);
static void save_entry(gzFile *out, const struct inode_entry *entry);
static void *p_malloc(unsigned int hash, size_t size);
static struct inode_entry *alloc_inode_entry(unsigned int hash, int is_new);
static void delete_inode_entry(struct inode_entry *entry);
static struct name_entry *alloc_name_entry(unsigned int hash,
					   const char *name);
static void delete_name_entry(struct name_entry *entry);
static void merge_and_link(void);
static void merge_old_new(unsigned int hash);
static void merge_new(unsigned int hash);
static void link_old_new(unsigned int hash);
static void link_old_old(unsigned int hash);
static void link_new(unsigned int hash);
static int do_unlink(const char *name);
static int do_link(const char *master, const char *slave);
static int do_symlink(const char *master, const char *slave);
static void merge_link(struct inode_entry *entry0, struct inode_entry *entry1);
static void link_soft(struct inode_entry *entry0, struct inode_entry *entry1);
static void link_hard(struct inode_entry *entry0, struct inode_entry *entry1);
static void read_list(void);
static const char *get_fname(void);
static struct inode_entry *get_entry(void);
static int __get_stat(struct inode_entry *entry);
static unsigned int calc_hash(const struct stat64 *sb);
static int __get_crc(struct inode_entry *entry);
static int cmp(const struct inode_entry *entry1,
	       const struct inode_entry *entry2);
static void sighandler(int signum);
static void usage(void);


    /*
     *  Timings, Progress and Status Reporting
     */

/*
 * Announce the start of a processing phase.
 *
 * With timings enabled the message is printed as a padded label and the
 * current tick count is remembered for do_stop(); otherwise the message
 * is only printed in verbose mode.
 */
static void do_start(const char *msg)
{
    struct tms cpu;

    if (!o_timings) {
        if (o_verbose)
            fprintf(stderr, "%s", msg);
        return;
    }

    fprintf(stderr, "%-60s: ", msg);
    start_time = times(&cpu);
}

/*
 * Announce the end of a processing phase started with do_start().
 *
 * With timings enabled the elapsed time since do_start() is printed in
 * seconds and added to total_time; otherwise verbose mode just terminates
 * the line printed by do_start().
 */
static void do_stop(void)
{
    struct tms tms;
    time_t elapsed;

    if (o_timings) {
        elapsed = times(&tms)-start_time;
        total_time += elapsed;
        /*
         * times() counts in clock ticks.  The obsolete CLK_TCK macro is no
         * longer provided by current C libraries, so ask the system for
         * the number of ticks per second.
         */
        fprintf(stderr, "%10.2f s\n",
                (double)elapsed/sysconf(_SC_CLK_TCK));
    } else if (o_verbose)
        fputs("\n", stderr);
}

/*
 * Print the final report on stderr: the total measured time (only with
 * timings enabled) followed by the statistics counters.  The skip counters
 * are only shown with the user option, the cache counters only when a
 * cache file is in use.
 */
static void do_end(void)
{
    if (o_timings)
        /*
         * total_time is in clock ticks; use sysconf() instead of the
         * obsolete CLK_TCK macro to convert it to seconds.
         */
        fprintf(stderr, "%-60s: %10.2f s\n", "*** Total execution time ***",
                (double)total_time/sysconf(_SC_CLK_TCK));
    fputs("Statistics:\n", stderr);
    fprintf(stderr, "  Processed %lu new files\n", stat_new);
    fprintf(stderr, "  Merged %lu hard links\n", stat_merge);
    fprintf(stderr, "  Statted %lu files\n", stat_stat);
    fprintf(stderr, "  Calculated %lu CRCs\n", stat_crc);
    fprintf(stderr, "  Compared %lu files\n", stat_cmp);
    fprintf(stderr,
            "  Linked %lu names for %lu identical files saving %llu bytes\n",
            stat_link_name, stat_link_inode, stat_saved);
    if (o_user)
        fprintf(stderr, "  Skipped %lu names for %lu identical files\n",
                stat_skip_name, stat_skip_inode);
    if (o_cache)
        fprintf(stderr,
                "  Cache: %lu names/%lu files in, %lu names/%lu files out\n",
                stat_names_in, stat_files_in, stat_names_out, stat_files_out);
}

/*
 * Draw a 50 column progress bar on stderr (verbose mode without timings
 * only).
 *
 * percent is halved to get the bar position.  Position 0 draws the empty
 * frame, any other new position appends the '#' marks that are missing
 * since the previous call, and position 50 closes the frame.  The last
 * position is remembered even when nothing is printed.
 */
static void progress(int percent)
{
    static int last = -1;
    int column = percent / 2;
    int missing;

    if (o_verbose && !o_timings && column != last) {
        if (column != 0) {
            for (missing = column - last; missing > 0; missing--)
                fputc('#', stderr);
            if (column == 50)
                fputs("]", stderr);
        } else {
            fputs("\r  [                                                  ]"
                  "\r  [", stderr);
        }
    }
    last = column;
}


    /*
     *  Debugging
     */

/*
 * Print one inode entry and all of its names on stdout (debugging aid).
 *
 * entry:  the entry to print
 * indent: string printed in front of every line
 *
 * Every value is cast to the exact type its conversion specifier expects;
 * passing an nlink_t for "%d" or a struct pointer for "%p" is undefined
 * behaviour, and casting the size to unsigned long truncated large files
 * on 32 bit hosts.
 */
static void dump_inode_entry(const struct inode_entry *entry,
                             const char *indent)
{
    const struct name_entry *names;

    printf("%sentry %p size %llu crc %08x device %lx inode %lx nlink %lu uid "
           "%lx\n",
           indent, (const void *)entry, (unsigned long long)entry->size,
           entry->crc,
           (unsigned long)entry->device, (unsigned long)entry->inode,
           (unsigned long)entry->nlink, (unsigned long)entry->uid);
    for (names = entry->names; names; names = names->next)
        printf("%s  %s\n", indent, names->name);
}

/*
 * Print every non-empty hash bucket, starting at hash_start: first the
 * cached entries, then the new ones.  Stops early when `stop` is set.
 */
static void dump_hashtable(void)
{
    const struct inode_entry *cur;
    unsigned int bucket;

    bucket = hash_start;
    while (!stop && bucket < MAXHASH) {
        if (hashtable[bucket].old || hashtable[bucket].new) {
            printf("hash %u:\n", bucket);
            for (cur = hashtable[bucket].old; cur; cur = cur->next)
                dump_inode_entry(cur, "  ");
            for (cur = hashtable[bucket].new; cur; cur = cur->next)
                dump_inode_entry(cur, "  ");
        }
        bucket++;
    }
}


/*
 * Make sure the stat fields of an entry are filled in.
 * Returns 1 right away when they already are, otherwise the result of
 * __get_stat().
 */
static inline int get_stat(struct inode_entry *entry)
{
    if (entry->flags & F_STAT_VALID)
        return 1;
    return __get_stat(entry);
}

/*
 * Make sure the CRC of an entry is known.
 * Returns 1 right away when it already is, otherwise the result of
 * __get_crc().
 */
static inline int get_crc(struct inode_entry *entry)
{
    if (entry->flags & F_CRC_VALID)
        return 1;
    return __get_crc(entry);
}

/*
 * Tell whether an entry carries the F_NEW flag.
 * Returns the masked flag bit itself (non-zero if set), not a normalized
 * 0/1 value.
 */
static inline int is_new(const struct inode_entry *entry)
{
    int new_bit = entry->flags & F_NEW;

    return new_bit;
}


    /*
     *  Load the cache
     */

/*
 * Read the cache file o_cache into the "old" lists of the hash table.
 *
 * File format, one record per line:
 *     <size> TAB <crc> TAB <name>     starts a new inode entry
 *     TAB <name>                      adds a name to the current entry
 *
 * Malformed lines are reported on stderr and skipped.  Reading stops early
 * when `stop` is set.  A cache file that cannot be opened is reported and
 * otherwise ignored.
 */
static void load_cache(void)
{
    static char buf[BUFSIZE];
    gzFile in;			/* gzFile already is a handle, not gzFile * */
    unsigned long line = 0;
    char *s, *field;
    loff_t size;
    unsigned int crc;
    const char *name;
    unsigned int hash = 0;
    struct inode_entry *entry;
    struct name_entry **next_name = NULL;

    do_start("Loading cache from previous session");
    in = gzopen(o_cache, "r");
    if (!in) {
        do_stop();
        fprintf(stderr, "open %s: %s\n", o_cache, strerror(errno));
        return;
    }
    while (!stop && gzgets(in, buf, sizeof(buf))) {
        line++;
        s = strchr(buf, '\n');
        if (!s)
            goto error;		/* line too long or last line unterminated */
        *s = '\0';
        if (buf[0] != '\t') {
            /* Header line: size, CRC and first name of a new entry */
            /* strtoull: sizes beyond 4 GB must survive on 32 bit hosts */
            size = strtoull(buf, &s, 0);
            if (*s != '\t')
                goto error;
            field = s+1;
            crc = strtoul(field, &s, 0);
            /* An empty CRC field must not be accepted as CRC 0 */
            if (s == field || *s != '\t')
                goto error;
            name = s+1;
            hash = size % MAXHASH;
            entry = alloc_inode_entry(hash, false);
            entry->names = alloc_name_entry(hash, name);
            entry->size = size;
            entry->crc = crc;
            entry->flags = 0;
            if (entry->crc != INVALID_CRC)
                entry->flags |= F_CRC_VALID;
            next_name = &entry->names->next;
            stat_names_in++;
            stat_files_in++;
        } else {
            /* Continuation line: one more name for the current entry */
            if (!next_name)
                goto error;
            name = buf+1;
            *next_name = alloc_name_entry(hash, name);
            next_name = &(*next_name)->next;
            stat_names_in++;
        }
        continue;

error:
        fprintf(stderr, "Parse error on line %lu of %s\n", line,
                o_cache);
        /*
         * Forget the current entry: names following a broken header line
         * belong to the broken entry and must not be attached to the
         * previous (unrelated) file.
         */
        next_name = NULL;
    }
    gzclose(in);
    do_stop();
}

/*
 * Disabled code (#if 0): malloc() wrapper that reports a failed allocation
 * on stderr and returns NULL.  Nothing in the reviewed part of the file
 * calls it.
 * NOTE(review): "%d" is used with a size_t argument; fix the format before
 * re-enabling this.
 */
#if 0
static void *Malloc (size_t size)
{
    void *t;

    t = malloc (size); 
    if (t) return t; 
    do_stop (); 
    fprintf (stderr, "malloc %d failed: %s\n",size, strerror (errno));
    return (NULL); 
}
#endif

    /*
     *  Save the cache
     */

static void save_cache(void)
{
    struct stat sb;
    gzFile *out;
    unsigned int i;
    struct inode_entry *entry;
    char backup[PATH_MAX+1], tmpname[PATH_MAX+1];

    if (o_dryrun)
	return;

    do_start("Saving cache for next session");
    sprintf (backup,  "%s%s", o_cache, BACKUP_SUFFIX); 
    sprintf (tmpname, "%s%s", o_cache, TMPNAME_SUFFIX); 

    /* Give the user a chance to save the cache, but allow 
       him to control-c the saving process as well */
    stop = 0;
    out = gzopen(tmpname, "w");
    if (!out) {
	do_stop();
	fprintf(stderr, "create %s: %s\n", o_cache, strerror(errno));
	return;
    }

    for (i = hash_start; !stop && (i < MAXHASH); i++) {
	for (entry = hashtable[i].old; entry; entry = entry->next)
	    save_entry(out, entry);
	for (entry = hashtable[i].new; entry; entry = entry->next) {
#if 0
	    if (!get_crc(entry))
		continue;
#endif
	    save_entry(out, entry);
	}
    }
    gzclose(out);
    do_stop();

    if (stop) {
	/* Hmm. Interrupted. Forget about renaming the files then. */
	unlink (tmpname); 
	return;
    }
    if (!stat(o_cache, &sb)) {
	if (rename(o_cache, backup) == -1) {
	    fprintf(stderr, "rename %s %s: %s\n", o_cache, backup,
		    strerror(errno));
        }
    }
    if (rename(tmpname, o_cache) == -1) {
	fprintf(stderr, "rename %s %s: %s\n", tmpname, o_cache,
		    strerror(errno));
    }
}


    /*
     *  Save a cache entry
     */

/*
 * Write one inode entry to the cache file.
 *
 * out:   the zlib handle of the cache file.  The parameter is declared as
 *        `gzFile *` for historical reasons, but callers pass the handle
 *        returned by gzopen() itself, so it is converted back before use.
 * entry: the entry to write; it must have at least one name.
 *
 * Output: "<size> TAB <crc>" followed by "TAB <name> NEWLINE" for every
 * name, which puts the first name on the header line and every further
 * name on a line of its own.
 */
static void save_entry(gzFile *out, const struct inode_entry *entry)
{
    const struct name_entry *names;
    gzFile gz = (gzFile)out;

    assert(entry->names != NULL);
    /* unsigned long long: do not truncate sizes beyond 4 GB on 32 bit hosts */
    gzprintf(gz, "%llu\t0x%08x", (unsigned long long)entry->size,
             entry->crc);
    stat_files_out++;
    for (names = entry->names; names; names = names->next) {
        gzprintf(gz, "\t%s\n", names->name);
        stat_names_out++;
    }
}


    /*
     *  Pooled Memory Allocation
     */

/*
 * Allocate `size` bytes from the memory pools of hash bucket `hash`.
 *
 * The request is rounded up to whole unsigned longs and carved from the
 * first pool of the bucket that has enough room left; a new pool is added
 * to the bucket when none has.  Requests larger than a pool are passed on
 * to malloc() with a warning.  Memory obtained here is never freed
 * individually.
 *
 * Never returns NULL: callers use the result without checking it, so the
 * program is terminated with a message when the system is out of memory.
 */
static void *p_malloc(unsigned int hash, size_t size)
{
    struct pool *pool;
    void *data;

    if (size > sizeof(pool->data)) {
        fprintf(stderr, "Warning: p_malloc() of size %lu\n",
                (unsigned long)size);
        data = malloc(size);
        if (!data) {
            fprintf(stderr, "malloc %lu failed: %s\n", (unsigned long)size,
                    strerror(errno));
            exit(1);
        }
        return data;
    }

    /* From here on `size` counts unsigned longs, not bytes */
    size = (size+sizeof(unsigned long)-1)/sizeof(unsigned long);
    for (pool = pooltable[hash]; pool; pool = pool->next) {
        if (pool->nfree >= size)
            break;
    }
    if (!pool) {
        pool = malloc(sizeof(struct pool));
        if (!pool) {
            fprintf(stderr, "malloc %lu failed: %s\n",
                    (unsigned long)sizeof(struct pool), strerror(errno));
            exit(1);
        }
        pool->next = pooltable[hash];
        pool->nfree = POOL_NUM_DATA;
        pooltable[hash] = pool;
    }
    /* The free part of a pool is always the tail of its data area */
    data = &pool->data[POOL_NUM_DATA-pool->nfree];
    pool->nfree -= size;
    return data;
}


    /*
     *  Inode Entry Allocation and Deallocation
     */

/*
 * Allocate an inode entry from the pool of bucket `hash` and push it onto
 * the front of that bucket's "new" list (is_new non-zero) or "old" list.
 * Only the `next` field is initialised; the caller fills in the rest.
 */
static struct inode_entry *alloc_inode_entry(unsigned int hash, int is_new)
{
    struct inode_entry *fresh;
    struct inode_entry **head;

    fresh = p_malloc(hash, sizeof(struct inode_entry));
    head = is_new ? &hashtable[hash].new : &hashtable[hash].old;
    fresh->next = *head;
    *head = fresh;
    return fresh;
}

/*
 * Dispose of an inode entry that has been taken out of its list: every
 * name still attached to it is handed to delete_name_entry().  The entry
 * itself lives in a memory pool and cannot be freed on its own.
 */
static void delete_inode_entry(struct inode_entry *entry)
{
    struct name_entry *cur, *following;

    for (cur = entry->names; cur; cur = following) {
        if (o_debug > 2)
            printf("delete name %s\n", cur->name);
        following = cur->next;
        delete_name_entry(cur);
    }
    /* The entry itself stays allocated until its whole pool goes away */
}


    /*
     *  Name Entry Allocation and Deallocation
     */

/*
 * Allocate a name entry from the pool of bucket `hash` holding a copy of
 * `name`.  The entry is not linked into any list yet (next is NULL).
 */
static struct name_entry *alloc_name_entry(unsigned int hash, const char *name)
{
    size_t bytes = strlen(name) + 1;	/* including the terminating NUL */
    struct name_entry *fresh;

    fresh = p_malloc(hash, sizeof(struct name_entry) + bytes);
    memcpy(fresh->name, name, bytes);
    fresh->next = NULL;
    return fresh;
}

/*
 * Counterpart of alloc_name_entry().  Deliberately does nothing: names are
 * carved out of a memory pool and are not released one by one.
 */
static void delete_name_entry(struct name_entry *entry __attribute__((__unused__)))
{
    (void)entry;
}


    /*
     *  Merge Hard Links and Link Identical Files
     */
/*
 * Main processing phase.  For every hash bucket from hash_start onwards:
 * first fold already existing hard links together, then link identical
 * files (cached against cached only with the merge-cache option, cached
 * against new, new against new).  Stops early when `stop` is set.
 */
static void merge_and_link(void)
{
    unsigned int bucket;

    do_start("Merging hard links and linking identical files");
    bucket = hash_start;
    while (!stop && bucket < MAXHASH) {
        progress(100*bucket/(MAXHASH+1));

        /* Existing hard links */
        merge_old_new(bucket);
        merge_new(bucket);

        /* Identical contents */
        if (o_merge_cache)
            link_old_old(bucket);
        link_old_new(bucket);
        link_new(bucket);

        bucket++;
    }
    progress(100);
    do_stop();
}


    /*
     *  Merge hard links between old and new files
     */

/*
 * Within one hash bucket, find new entries that are the same inode as a
 * cached entry (equal size, device and inode number) and move their name
 * over to the cached entry.
 *
 * A cached entry is only statted once a new entry of the same size shows
 * up; if none of its names can be statted it is dropped from the list.
 */
static void merge_old_new(unsigned int hash)
{
    struct inode_entry **old_link, **new_link;
    struct inode_entry *old, *fresh;
    int old_dropped;

    /* old_link/new_link always point at the pointer that refers to the
       entry under inspection, so unlinking is a single assignment */
    old_link = &hashtable[hash].old;
    while (!stop && (old = *old_link) != NULL) {
        old_dropped = 0;
        new_link = &hashtable[hash].new;
        while (!stop && (fresh = *new_link) != NULL) {
            if (old->size == fresh->size) {
                if (!get_stat(old)) {
                    /* Cached entry no longer exists: unlink and move on */
                    *old_link = old->next;
                    delete_inode_entry(old);
                    old_dropped = 1;
                    break;
                }
                if (old->device == fresh->device &&
                    old->inode == fresh->inode) {
                    /* Same inode: absorb the new entry's name */
                    merge_link(old, fresh);
                    *new_link = fresh->next;
                    delete_inode_entry(fresh);
                    continue;
                }
            }
            new_link = &fresh->next;
        }
        if (!old_dropped)
            old_link = &old->next;
    }
}


    /*
     *  Merge hard links between new files
     */

/*
 * Within the "new" list of one hash bucket, fold entries that are the same
 * inode (equal device and inode number) into the first of them, so that
 * one entry carries all the names of an existing hard link group.
 */
static void merge_new(unsigned int hash)
{
    struct inode_entry *keeper, *cand, **link;

    for (keeper = hashtable[hash].new; !stop && keeper;
         keeper = keeper->next) {
        /* Compare against every entry behind the keeper */
        link = &keeper->next;
        while (!stop && (cand = *link) != NULL) {
            if (keeper->device != cand->device ||
                keeper->inode != cand->inode) {
                link = &cand->next;
                continue;
            }
            /* Same inode: take over the name and unlink the candidate */
            merge_link(keeper, cand);
            *link = cand->next;
            delete_inode_entry(cand);
        }
    }
}


    /*
     *  Link identical files within the cached files
     */

/*
 * Within the cached ("old") list of one hash bucket, link every pair of
 * entries with identical contents.
 *
 * Two entries are candidates when their sizes match (size zero only with
 * the null-files option).  Both are then statted and checksummed; an entry
 * for which that fails is dropped from the list.  Files are linked when
 * CRC, device (hard links only), owner (user option only) and finally the
 * contents match.
 *
 * Both entries come from the cache, so neither has valid stat data yet:
 * entry1 must be statted as well before its device, uid or link count is
 * looked at.
 */
static void link_old_old(unsigned int hash)
{
    struct inode_entry *entry0, *entry1, **prev0, **prev1;

    /* Loop over all old files */
    prev0 = &hashtable[hash].old;
    entry0 = hashtable[hash].old;
    while (!stop && entry0) {
        /* Loop over all old files behind entry0 */
        prev1 = &entry0->next;
        entry1 = entry0->next;
        while (!stop && entry1) {
            /* Check size match first */
            if (entry0->size == entry1->size &&
                (entry0->size != 0 || o_nullfiles)) {
                if (!get_stat(entry0) || !get_crc(entry0)) {
                    /* Remove erroneous entry0 */
                    *prev0 = entry0->next;
                    delete_inode_entry(entry0);
                    entry0 = *prev0;
                    /* Continue outer loop */
                    goto continue_outer_loop;
                }
                if (!get_stat(entry1) || !get_crc(entry1)) {
                    /* Remove erroneous entry1 */
                    *prev1 = entry1->next;
                    delete_inode_entry(entry1);
                    entry1 = *prev1;
                    /* Continue inner loop */
                    continue;
                }
                /* Check CRC, device (for hard links), and contents match */
                if (entry0->crc == entry1->crc &&
                    (o_softlinks || (entry0->device == entry1->device)) &&
                    (!o_user || (entry0->uid == entry1->uid)) &&
                    cmp(entry0, entry1)) {
                    /* Identical contents found */
                    if (o_softlinks)
                        link_soft(entry0, entry1);
                    else
                        link_hard(entry0, entry1);
                    *prev1 = entry1->next;
                    delete_inode_entry(entry1);
                    entry1 = *prev1;
                    /* Continue inner loop */
                    continue;
                }
            }
            prev1 = &entry1->next;
            entry1 = entry1->next;
        }
        prev0 = &entry0->next;
        entry0 = entry0->next;
continue_outer_loop:;
    }
}



    /*
     *  Link identical files between old and new files
     */

/*
 * Within one hash bucket, link every new entry whose contents are
 * identical to a cached ("old") entry.
 *
 * Candidates need equal sizes (size zero only with the null-files option).
 * The cached entry is statted and checksummed, the new entry checksummed;
 * an entry for which that fails is dropped from its list.  Files are
 * linked when CRC, device (hard links only), owner (user option only) and
 * finally the contents match; the new entry then leaves the new list.
 */
static void link_old_new(unsigned int hash)
{
    struct inode_entry **old_link, **new_link;
    struct inode_entry *old, *fresh;
    int old_dropped;

    old_link = &hashtable[hash].old;
    while (!stop && (old = *old_link) != NULL) {
        old_dropped = 0;
        new_link = &hashtable[hash].new;
        while (!stop && (fresh = *new_link) != NULL) {
            /* Different size, or empty files that are not to be linked */
            if (old->size != fresh->size ||
                (old->size == 0 && !o_nullfiles)) {
                new_link = &fresh->next;
                continue;
            }
            if (!get_stat(old) || !get_crc(old)) {
                /* Cached entry is unusable: unlink it, next old entry */
                *old_link = old->next;
                delete_inode_entry(old);
                old_dropped = 1;
                break;
            }
            if (!get_crc(fresh)) {
                /* New entry is unusable: unlink it, next new entry */
                *new_link = fresh->next;
                delete_inode_entry(fresh);
                continue;
            }
            if (old->crc == fresh->crc &&
                (o_softlinks || old->device == fresh->device) &&
                (!o_user || old->uid == fresh->uid) &&
                cmp(old, fresh)) {
                /* Identical contents found */
                if (o_softlinks)
                    link_soft(old, fresh);
                else
                    link_hard(old, fresh);
                *new_link = fresh->next;
                delete_inode_entry(fresh);
                continue;
            }
            new_link = &fresh->next;
        }
        if (!old_dropped)
            old_link = &old->next;
    }
}


    /*
     *  Link identical files between new files
     */

/*
 * Within the "new" list of one hash bucket, link every pair of entries
 * with identical contents.
 *
 * Candidates need equal sizes (size zero only with the null-files option).
 * Both entries are checksummed; an entry for which that fails is dropped
 * from the list.  Files are linked when CRC, device (hard links only),
 * owner (user option only) and finally the contents match.
 *
 * prev0 must follow entry0 through the list: it is the pointer that gets
 * rewritten when entry0 is dropped, and a stale prev0 would cut all
 * entries in front of entry0 out of the list.
 */
static void link_new(unsigned int hash)
{
    struct inode_entry *entry0, *entry1, **prev0, **prev1;

    /* Loop over all new files */
    prev0 = &hashtable[hash].new;
    entry0 = hashtable[hash].new;
    while (!stop && entry0) {
        /* Loop over all new files behind entry0 */
        prev1 = &entry0->next;
        entry1 = entry0->next;
        while (!stop && entry1) {
            /* Check size match first */
            if (entry0->size == entry1->size &&
                (entry0->size || o_nullfiles)) {
                if (!get_crc(entry0)) {
                    /* Remove erroneous entry0 */
                    *prev0 = entry0->next;
                    delete_inode_entry(entry0);
                    entry0 = *prev0;
                    /* Continue outer loop */
                    goto continue_outer_loop;
                }
                if (!get_crc(entry1)) {
                    /* Remove erroneous entry1 */
                    *prev1 = entry1->next;
                    delete_inode_entry(entry1);
                    entry1 = *prev1;
                    /* Continue inner loop */
                    continue;
                }
                /* Check CRC, device (for hard links), and contents match */
                if (entry0->crc == entry1->crc &&
                    (o_softlinks || (entry0->device == entry1->device)) &&
                    (!o_user || (entry0->uid == entry1->uid)) &&
                    cmp(entry0, entry1)) {
                    /* Identical contents found */
                    if (o_softlinks)
                        link_soft(entry0, entry1);
                    else
                        link_hard(entry0, entry1);
                    *prev1 = entry1->next;
                    delete_inode_entry(entry1);
                    entry1 = *prev1;
                    /* Continue inner loop */
                    continue;
                }
            }
            prev1 = &entry1->next;
            entry1 = entry1->next;
        }
        /* Keep prev0 in step with entry0 (this update used to be missing) */
        prev0 = &entry0->next;
        entry0 = entry0->next;
continue_outer_loop:;
    }
}


    /*
     *  Unlink, Link and Symlink Wrappers
     */

/*
 * Remove `name`, honouring the debug and dry-run options.
 * Returns the result of unlink() (-1 on failure, after reporting it on
 * stderr); in dry-run mode nothing is removed and 0 is returned.
 */
static int do_unlink(const char *name)
{
    int res;

    if (o_debug > 0)
        printf("unlink %s\n", name);
    if (o_dryrun)
        return 0;

    res = unlink(name);
    if (res == -1)
        fprintf(stderr, "unlink %s: %s\n", name, strerror(errno));
    return res;
}

/*
 * Create `slave` as a hard link to `master`, honouring the debug and
 * dry-run options.
 * Returns the result of link() (-1 on failure, after reporting it on
 * stderr); in dry-run mode nothing is created and 0 is returned.
 */
static int do_link(const char *master, const char *slave)
{
    int res;

    if (o_debug > 0)
        printf("link %s %s\n", master, slave);
    if (o_dryrun)
        return 0;

    res = link(master, slave);
    if (res == -1)
        fprintf(stderr, "link %s %s: %s\n", master, slave, strerror(errno));
    return res;
}

/*
 * Create `slave` as a symbolic link with target `master`, honouring the
 * debug and dry-run options.
 * Returns the result of symlink() (-1 on failure, after reporting it on
 * stderr); in dry-run mode nothing is created and 0 is returned.
 */
static int do_symlink(const char *master, const char *slave)
{
    int res;

    if (o_debug > 0)
        printf("symlink %s %s\n", master, slave);
    if (o_dryrun)
        return 0;

    res = symlink(master, slave);
    if (res == -1)
        fprintf(stderr, "symlink %s %s: %s\n", master, slave, strerror(errno));
    return res;
}


    /*
     *  Merge a Hard Link
     */

static void merge_link(struct inode_entry *entry0, struct inode_entry *entry1)
{
    assert(entry0->names != NULL);
    assert(entry1->names != NULL);
    assert(entry1->names->next == NULL);
    entry1->names->next = entry0->names;
    entry0->names = entry1->names;
    entry1->names = NULL;
    stat_merge++;
}


    /*
     *  Symlink all Names for Two Identical Files
     */

/*
 * Replace every name of entry1 by a symbolic link to the first name of
 * entry0.
 *
 * The link target is the master name prefixed with one "../" for every
 * '/' in the name being replaced.  The target is built, and checked to fit
 * into the buffer, before the file is removed, so an over-long target
 * leaves the file untouched and is reported instead of overflowing the
 * buffer.
 *
 * With the user option, files owned by different users are only counted
 * as skipped.  (An alternative branch that made entry1 the master could
 * never be taken and has been removed.)
 *
 * All names of entry1 are dropped from the database afterwards.
 */
static void link_soft(struct inode_entry *entry0, struct inode_entry *entry1)
{
    struct name_entry *slaves, *master, *next;
    static char buf[BUFSIZE];
    const char *p;
    size_t used, master_len;
    int fits;

    assert(entry0->names != NULL);
    assert(entry1->names != NULL);
    if (o_user && entry1->uid != entry0->uid) {
        stat_skip_name += entry0->nlink+entry1->nlink;
        stat_skip_inode += 2;
        return;
    }
    master = entry0->names;
    slaves = entry1->names;
    master_len = strlen(master->name);

    while (slaves) {
        /* One "../" per directory level of the name to be replaced */
        used = 0;
        fits = 1;
        for (p = slaves->name; (p = strchr(p,'/')) != NULL; p++) {
            if (used+3 >= sizeof(buf)) {
                fits = 0;
                break;
            }
            memcpy(buf+used, "../", 3);
            used += 3;
        }
        if (fits && used+master_len < sizeof(buf)) {
            /* Copies the terminating NUL of the master name as well */
            memcpy(buf+used, master->name, master_len+1);
            if (do_unlink(slaves->name) != -1)
                do_symlink(buf, slaves->name);
        } else {
            fprintf(stderr, "symlink %s: link target too long\n",
                    slaves->name);
        }
        next = slaves->next;
        delete_name_entry(slaves);
        slaves = next;
        stat_link_name++;
    }
    entry1->names = NULL;
    stat_link_inode++;
    stat_saved += entry0->size;
}


    /*
     *  Link all Names for Two Identical Files
     */

/*
 * Replace every name of one entry by a hard link to the first name of the
 * other, and collect all names in entry0.
 *
 * entry0 provides the master name unless it is a new entry with fewer
 * links than entry1.  The replaced names are spliced into the name list
 * right behind the master name; entry1 ends up without names and its link
 * count is added to entry0's.
 */
static void link_hard(struct inode_entry *entry0, struct inode_entry *entry1)
{
    struct name_entry *keep, *victim, *keep_rest, *following;
    int keep_entry0;

    assert(entry0->names != NULL);
    assert(entry1->names != NULL);

    keep_entry0 = !is_new(entry0) || entry0->nlink >= entry1->nlink;
    keep = keep_entry0 ? entry0->names : entry1->names;
    victim = keep_entry0 ? entry1->names : entry0->names;

    /* Splice the victim list in between the master and its other names */
    keep_rest = keep->next;
    keep->next = victim;

    for (; victim != NULL; victim = following) {
        if (do_unlink(victim->name) != -1)
            do_link(keep->name, victim->name);
        following = victim->next;
        if (following == NULL)
            victim->next = keep_rest;	/* re-attach the master's tail */
        stat_link_name++;
    }

    entry0->names = keep;
    entry1->names = NULL;
    entry0->nlink += entry1->nlink;
    stat_link_inode++;
    stat_saved += entry0->size;
}


    /*
     *  Read the List of File Names from Stdin
     */

/*
 * Read file names from stdin and enter them into the database until
 * get_entry() reports the end of the input, stdin is at end-of-file or
 * `stop` is set.
 */
static void read_list(void)
{
    do_start("Reading list of files");
    for (;;) {
        if (stop || feof(stdin))
            break;
        if (get_entry() == NULL)
            break;
    }
    do_stop();
}


/*
 * Return the next file name from stdin, or NULL at the end of the input.
 *
 * Without the null option names are read one per line with fgets() and the
 * trailing newline, if any, is removed.  With the null option names are
 * NUL-terminated and read with read(2) straight from file descriptor 0.
 *
 * The returned string lives in a static buffer and is only valid until the
 * next call.
 */
static const char *get_fname (void)
{
  static char tbuf[BUFSIZE];
  static int bib;		/* number of valid bytes in tbuf */
  static int pos;		/* offset of the first unconsumed byte */
  const char *t, *ep;
  size_t len;
  int rv;

  if (!o_null) {
    /* Do not touch the buffer at end-of-file or on error */
    if (fgets (tbuf, BUFSIZE, stdin) == NULL)
      return NULL;
    /* Only strip a newline; the last line may lack one */
    len = strlen (tbuf);
    if (len > 0 && tbuf[len-1] == '\n')
      tbuf[len-1] = '\0';
    return tbuf;
  }

  for (;;) {
    /* Is there a complete name in the buffer already? */
    if (pos < bib) {
      ep = memchr (tbuf + pos, 0, bib-pos);
      if (ep) {
        t = tbuf + pos;
        pos = ep - tbuf + 1;
        return t;
      }
    }

    /* No: move the incomplete name to the front of the buffer ... */
    if (pos > 0) {
      memmove (tbuf, tbuf + pos, bib-pos);
      bib -= pos;
      pos = 0;
    }
    if (bib >= BUFSIZE) {
      fprintf (stderr, "file name longer than %d bytes, input ignored\n",
	       BUFSIZE-1);
      return NULL;
    }

    /*
     * ... and read more.  A single read() may well return only part of a
     * name (pipes!), so keep reading until the terminating NUL shows up.
     */
    rv = read (0, tbuf + bib, BUFSIZE - bib);
    if (rv <= 0)
      return NULL;
    bib += rv;
  }
}


/*
 * Read file names from the input until one refers to a regular file and
 * create a new database entry for it.
 *
 * Returns the new entry, or NULL when the input is exhausted.  A name that
 * cannot be lstat()ed terminates the program.
 */
static struct inode_entry *get_entry(void)
{
    struct stat64 sb;
    struct inode_entry *fresh;
    const char *fname;
    unsigned int bucket;

    for (;;) {
        fname = get_fname();
        if (fname == NULL)
            return NULL;
        stat_stat++;
        if (lstat64(fname, &sb) < 0) {
            fprintf(stderr, "stat %s: %s\n", fname, strerror(errno));
            exit(1);
        }
        /* Silently skip everything that is not a regular file */
        if (S_ISREG(sb.st_mode))
            break;
    }

    bucket = calc_hash(&sb);
    fresh = alloc_inode_entry(bucket, true);
    fresh->names = alloc_name_entry(bucket, fname);
    fresh->flags = F_STAT_VALID | F_NEW;
    fresh->crc = INVALID_CRC;		/* not calculated yet */
    fresh->size = sb.st_size;
    fresh->device = sb.st_dev;
    fresh->inode = sb.st_ino;
    fresh->nlink = sb.st_nlink;
    fresh->uid = sb.st_uid;
    stat_new++;

    return fresh;
}


    /*
     *  Retrieve the file information block for a file
     */

static int __get_stat(struct inode_entry *entry)
{
    struct stat64 sb;
    struct name_entry *name;

    /* Loop until we find a file that does exist */
    while ((name = entry->names) != 0) {
	stat_stat++;
	if (lstat64(name->name, &sb) < 0)
	    fprintf(stderr, "stat %s: %s\n", name->name, strerror(errno));
	else if (S_ISREG(sb.st_mode)) {
	    entry->device = sb.st_dev;
	    entry->inode = sb.st_ino;
	    entry->nlink = sb.st_nlink;
	    entry->uid = sb.st_uid;
	    entry->flags |= F_STAT_VALID;
	    return 1;
	}
	entry->names = name->next;
	delete_name_entry(name);
    }
    return 0;
}


    /*
     *  Calculate the Hash Value for an Inode Entry
     *
     *  The hash is the file size from the stat block, modulo MAXHASH.
     */

static unsigned int calc_hash(const struct stat64 *sb)
{
    unsigned int bucket = sb->st_size % MAXHASH;

    return bucket;
}


    /*
     *  Calculate a CRC for a File.
     *
     *  Opens the first name attached to the entry, feeds its whole content
     *  through crc32() and stores the result in entry->crc, setting
     *  F_CRC_VALID.  Returns 1 on success.  If the file cannot be opened
     *  or read, a message is printed on stderr, the entry is left
     *  untouched and 0 is returned.
     */

static int __get_crc(struct inode_entry *entry)
{
    static unsigned char b1[BUFSIZE];
    unsigned int crc = 0;
    int f1, n, saved_errno;

    assert(entry->names != NULL);

    stat_crc++;
    if ((f1 = open(entry->names->name, O_RDONLY)) < 0) {
        fprintf(stderr, "open %s: %s\n", entry->names->name, strerror(errno));
        return 0;
    }

    while ((n = read(f1, b1, BUFSIZE)) > 0)
        crc = crc32(crc, b1, n);

    /* close() may overwrite errno, so remember read()'s error first. */
    saved_errno = errno;
    close(f1);
    if (n < 0) {
        fprintf(stderr, "read %s: %s\n", entry->names->name,
                strerror(saved_errno));
        return 0;
    }
    entry->crc = crc;
    entry->flags |= F_CRC_VALID;
    return 1;
}


    /*
     *  Compare Two Files
     *
     *  Reads the first name of each entry in BUFSIZE chunks and compares
     *  them.  Returns 1 only when both files reach end-of-file together
     *  with every chunk equal.  Returns 0 on any difference, on differing
     *  read lengths, on a read error, or when either file cannot be
     *  opened.
     */

static int cmp(const struct inode_entry *entry1,
               const struct inode_entry *entry2)
{
    static char b1[BUFSIZE], b2[BUFSIZE];
    int same = 0;
    int fd1, fd2, got1, got2;

    assert(entry1->names != NULL);
    assert(entry2->names != NULL);

    stat_cmp++;
    fflush(stdout);

    fd1 = open(entry1->names->name, O_RDONLY);
    if (fd1 == -1)
        return 0;

    fd2 = open(entry2->names->name, O_RDONLY);
    if (fd2 == -1) {
        close(fd1);
        return 0;
    }

    for (;;) {
        got1 = read(fd1, b1, BUFSIZE);
        got2 = read(fd2, b2, BUFSIZE);
        if (got1 != got2 || got1 < 0)
            break;
        if (got1 == 0) {
            /* Both files ended at the same point without a mismatch. */
            same = 1;
            break;
        }
        if (memcmp(b1, b2, got1) != 0)
            break;
    }

    close(fd2);
    close(fd1);
    return same;
}


    /*
     *  Signal Handler
     *
     *  Reports the signal on stderr.  While doing_input is set the process
     *  terminates immediately with status 2; otherwise the global stop
     *  flag is raised and the handler is reinstalled.
     *
     *  Only async-signal-safe calls are used: the message is assembled by
     *  hand and emitted with write(2), and termination uses _exit(), so
     *  buffered stdio output is not flushed on that path.
     */

static void sighandler(int signum)
{
    static const char head[] = " \n*** Caught signal ";
    static const char tail[] = " ***\n";
    char msg[64];
    char digits[12];
    size_t len = 0, nd = 0, k;
    unsigned int v;
    int saved_errno = errno;

    for (k = 0; head[k] != '\0'; k++)
        msg[len++] = head[k];
    if (signum < 0) {
        msg[len++] = '-';
        v = 0u - (unsigned int)signum;
    } else {
        v = (unsigned int)signum;
    }
    do {
        digits[nd++] = (char)('0' + v % 10);
        v /= 10;
    } while (v != 0);
    while (nd > 0)
        msg[len++] = digits[--nd];
    for (k = 0; tail[k] != '\0'; k++)
        msg[len++] = tail[k];

    if (write(STDERR_FILENO, msg, len) < 0) {
        /* Nothing sensible can be done about a failed diagnostic here. */
    }

    if (doing_input)
        _exit(2);
    stop = 1;
    /* Reinstall the signal handler. */
    signal(signum, sighandler);
    /* Do not let the handler disturb errno of the interrupted code. */
    errno = saved_errno;
}


/*
 *  Print the command line summary on stdout.
 *
 *  Lists every option that main() accepts.
 */
static void usage(void)
{
    printf("usage: same [options]\n");
    printf("\t-s, --softlinks    Create soft links instead of hard links\n");
    printf("\t-d, --debug        Output some debug messages\n");
    printf("\t-v, --verbose      Output verbose messages\n");
    printf("\t-n, --dryrun       Only simulate\n");
    printf("\t-t, --timings      Output timing statistics\n");
    printf("\t-u, --user         Don't relink files owned by another user\n");
    printf("\t-0, --null         File names on stdin are NUL-terminated\n");
    printf("\t-z, --nullfiles    Link zero size files, too\n");
    printf("\t-c, --cache f      Keep a cache between runs in file f\n");
    printf("\t-C, --mergecache   Enable cache merging (requires -c)\n");
    printf("\t-H, --hashstart n  Start at hash value n instead of 0\n");
    printf("\t-h, --help         This page\n");
    printf("\t-V, --version      print \"" VERSION "\"\n");
}


    /*
     *  Main Routine
     *
     *  Parses the command line, installs the signal handlers, reads the
     *  file list from stdin, optionally loads the cache, merges and links
     *  the files, saves the cache again and prints the final report.
     *  Every later stage is skipped once the global stop flag is set,
     *  except that the cache is still saved when one was requested.
     *
     *  Exits with 0 on normal completion, 1 on a command line error.
     */

int main(int argc, char **argv)
{
    int i;

    for (i = 1; i < argc; i++) {
        if (!strcmp(argv[i], "-h") || !strcmp(argv[i], "--help")) {
            usage();
            exit(0);
        } else if (!strcmp(argv[i], "-V") || !strcmp(argv[i], "--version")) {
            printf(VERSION "\n");
            exit(0);
        } else if (!strcmp(argv[i], "-s") || !strcmp(argv[i], "--softlinks")) {
            /* The option is recognised but rejected outright for now. */
            printf ("Softlinks currently broken. Sorry. --REW\n");
            exit (1);
        } else if (!strcmp(argv[i], "-d") || !strcmp(argv[i], "--debug")) {
            o_debug++;
        } else if (!strcmp(argv[i], "-v") || !strcmp(argv[i], "--verbose")) {
            o_verbose++;
        } else if (!strcmp(argv[i], "-n") || !strcmp(argv[i], "--dryrun")) {
            o_dryrun++;
        } else if (!strcmp(argv[i], "-t") || !strcmp(argv[i], "--timings")) {
            o_timings++;
        } else if (!strcmp(argv[i], "-u") || !strcmp(argv[i], "--user")) {
            o_user++;
        } else if (!strcmp(argv[i], "-0") || !strcmp(argv[i], "--null")) {
            o_null++;
        } else if (!strcmp(argv[i], "-C") || !strcmp(argv[i], "--mergecache")) {
            o_merge_cache++;
        } else if (!strcmp(argv[i], "-z") || !strcmp(argv[i], "--nullfiles")) {
            o_nullfiles++;
        } else if ((!strcmp(argv[i], "-c") || !strcmp(argv[i], "--cache"))) {
        /* This used to have an extra check for the argv[i+1] existing.
           I removed it. If it doesn't exist, o_cache will become NULL again!
           no bad things will happen. -- REW */
            o_cache = argv[i + 1];
            i++;
        } else if ((!strcmp(argv[i], "-H") || !strcmp(argv[i], "--hashstart"))) {
            char *end;

            /* argv[argc] is NULL: never hand it to "%s" or strtoul(). */
            if (!argv[i + 1]) {
                fprintf(stderr, "Missing --hashstart value\n");
                exit(1);
            }

            hash_start = strtoul(argv[i + 1], &end, 10);
            /* Reject an empty string as well as trailing garbage. */
            if (end == argv[i + 1] || *end) {
                fprintf(stderr, "Invalid --hashstart value %s\n", argv[i + 1]);
                exit(1);
            }
            i++;
        } else {
            fprintf(stderr, "Unknown option %s\n", argv[i]);
            exit(1);
        }
    }

    if (o_merge_cache && !o_cache) {
        fprintf (stderr, "Merge-cache requested, but no cache given. \n");
        exit (1);
    }

    signal(SIGINT,  sighandler);
    signal(SIGPIPE, sighandler);
    signal(SIGTERM, sighandler);

    /* uid = getuid();*/

    /* While doing_input is set, a signal terminates the program at once. */
    doing_input = 1;
    read_list();
    if (!stop && o_cache)
        load_cache();
    doing_input = 0;
    if (!stop && o_debug > 1)
        dump_hashtable();
    if (!stop)
        merge_and_link();
    if (!stop && o_debug > 1)
        dump_hashtable();
    /* The cache is written even after an interrupt. */
    if (o_cache)
        save_cache();
    if (!stop)
        do_end();

    exit(0);
}

