/*
 * lat_mem_rd.c - measure memory load latency
 *
 * usage: lat_mem_rd size-in-MB [stride ...]
 *        (with no stride argument, the built-in default STRIDE is used)
 *
 * Copyright (c) 1994 Larry McVoy.  Distributed under the FSF GPL with
 * additional restriction that results may published only if
 * (1) the benchmark is unmodified, and
 * (2) the version in the sccsid below is included in the report.
 * Support for this development by Sun Microsystems is gratefully acknowledged.
 */
/*
 * RCS identification string.  The license text at the top of this file asks
 * that the version recorded here be included with any published results.
 */
char	*id = "$Id: lat_mem_rd.c,v 1.4 1996/05/30 07:33:07 lm Exp lm $\n";

/* Number of chained loads performed in each timed pass of loads(). */
#define N       1000000
/*
 * Stride main() hands to loads() when no stride is given on the command
 * line.  The expression has type size_t.
 * NOTE(review): the value is 512 divided by the pointer size, while loads()
 * uses its stride as a byte offset -- confirm this is the intended default.
 */
#define STRIDE  (512/sizeof(char *))
/* Timed passes per (range, stride) pair; loads() keeps the smallest time. */
#define	TRIES	4
/* First range value, in bytes, of the sweep performed by main(). */
#define	LOWER	512

#include	"timing.c"

/*
 * main - sweep the buffer size for every requested stride.
 *
 * av[1] is the buffer size in megabytes.  Each further argument is a stride
 * parsed by bytes(); when none is given the built-in STRIDE is used.  For
 * every stride, loads() is called with range growing from LOWER up to the
 * buffer size in the increments chosen by step().
 *
 * Exits with status 1 on a missing or unusable size argument or when the
 * buffer cannot be allocated, and with status 0 otherwise.
 */
int
main(int ac, char **av)
{
	int	mb;
	int	len;
	int	range;
	int	stride;
	int	i;
	float	clk, getclock();
	/*
	 * Old-style (non-prototype) declarations: they match the definitions
	 * further down in this file and keep the default argument promotions
	 * that loads() relies on for its float parameter.
	 */
	int	loads(), step(), bytes();
	char	*addr;

	if (ac < 2) {
		fprintf(stderr, "usage: lat_mem_rd size-in-MB [stride ...]\n");
		exit(1);
	}

	/* Refuse sizes whose byte count would not fit in a 32-bit int. */
	mb = atoi(av[1]);
	if (mb <= 0 || mb > 0x7fffffff / (1024 * 1024)) {
		fprintf(stderr, "lat_mem_rd: bad size \"%s\"\n", av[1]);
		exit(1);
	}
	len = mb * 1024 * 1024;

	addr = (char *)malloc(len);
	if (!addr) {
		fprintf(stderr, "lat_mem_rd: cannot allocate %d bytes\n", len);
		exit(1);
	}
	clk = getclock();
	printf("clk=%.2f\n", clk);

	if (ac < 3) {
		/*
		 * STRIDE is a size_t expression; convert it once so that both
		 * the %d conversion and the call to loads() receive an int.
		 */
		stride = (int)STRIDE;
		fprintf(stderr, "\"stride=%d\n", stride);
		for (range = LOWER; range <= len; range = step(range)) {
			loads(addr, range, stride, clk);
		}
	} else {
		for (i = 2; i < ac; ++i) {
			stride = bytes(av[i]);
			fprintf(stderr, "\"stride=%d\n", stride);
			for (range = LOWER; range <= len; range = step(range)) {
				loads(addr, range, stride, clk);
			}
			fprintf(stderr, "\n");
		}
	}
	exit(0);
}

/*
 * loads - build a circular chain of pointers inside addr[0..range) and time
 * walking it.
 *
 *	addr	buffer that receives the chain
 *	range	number of bytes of addr the chain may occupy
 *	stride	distance in bytes from one node to the next
 *	clk	value obtained from getclock(); used to estimate the time
 *		spent executing the instructions themselves
 *
 * A node is one pointer (the link) followed by one int that the timed loop
 * overwrites.  Writes "<range in MB> <result>" to stderr, or a "list: ..."
 * message if the arguments cannot describe a chain.  The return value
 * carries no information and is always 0.
 *
 * The definition deliberately stays in old (non-prototype) style: main()
 * calls loads() without a prototype in scope, so clk arrives promoted to
 * double, and only an old-style definition converts it back to float.
 */
int
loads(addr, range, stride, clk)
	char	*addr;
	int	range;
	int	stride;
	float	clk;
{
	register char **p;
	int	i;
	int	tries;
	int	result = 0x7fffffff;
	/* bytes a node occupies: the link plus the int stored behind it */
	int	node = sizeof(char *) + sizeof(int);
	/* highest offset at which a complete node still fits into range */
	int	limit = range - node;

	if (stride & (sizeof(char *) - 1)) {
		fprintf(stderr, "list: stride must be aligned.\n");
		return (0);
	}
	/*
	 * Also rejects zero and negative strides, which would otherwise never
	 * advance through the buffer or would index in front of it.
	 */
	if (stride < node) {
		fprintf(stderr, "list: stride must be at least %d bytes.\n",
		    node);
		return (0);
	}
	if (limit < 0) {
		fprintf(stderr, "list: range must be at least %d bytes.\n",
		    node);
		return (0);
	}

	/*
	 * First create the list of pointers.  Every node links to the one
	 * stride bytes further on; the last node that fits links back to
	 * addr[0].  The test is written as a subtraction so that it cannot
	 * overflow for large ranges and strides.
	 */
	i = 0;
	for (;;) {
		p = (char **)&addr[i];
		if (stride > limit - i) {
			*p = &addr[0];
			break;
		}
		*p = &addr[i + stride];
		i += stride;
	}

	/*
	 * Now walk them and time it.  ONE follows a link and stores into the
	 * int located directly behind the link.  The store is addressed as
	 * (p + 1) rather than ((int *)p)[1]: the two are the same location
	 * when int and pointer have equal size, but where pointers are wider
	 * the latter would overwrite part of the link itself.
	 */
#ifdef	BEFORE
#define	ONE	*(int *)(p + 1) = 0; p = (char **)*p;
#else
#define	ONE	p = (char **)*p; *(int *)(p + 1) = 0;
#endif
#define	FIVE	ONE ONE ONE ONE ONE
#define	TEN	FIVE FIVE
#define	FIFTY	TEN TEN TEN TEN TEN
#define	HUNDRED	FIFTY FIFTY
	for (tries = 0; tries < TRIES; ++tries) {
		i = N;
		p = (char **)addr;
		start();
		while (i > 0) {
			/* 500 dependent loads per trip around the loop */
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			HUNDRED
			i -= 500;
		}
		/* handing p to stop() keeps the walk's result in use */
		i = stop(p);
		if (i < result) {
			result = i;	/* keep the fastest pass */
		}
	}

	/*
	 * We want nanoseconds per load, covering only the memory latency and
	 * not the time to execute the instructions.  stop() presumably
	 * reports the elapsed time of the N loads in microseconds and clk is
	 * presumably the clock period in nanoseconds (TODO confirm both
	 * against timing.c and mhz).  Two clocks per iteration are deducted,
	 * presumably one for the load and one for the store in ONE.
	 *
	 * XXX - we do not account for loop overhead here.
	 */
	i = (clk * N * 2) / 1000;	/* instruction time in usecs */
	result -= i;			/* time for the loads themselves */
	result *= 1000;			/* convert to nanoseconds */
	result /= N;			/* nanosecs per load */
	fprintf(stderr, "%.5f %d\n", range / (1024. * 1024), result);
	return (0);
}

/*
 * step - return the range value that follows k in the sweep.
 *
 * Below 1024 the value doubles; above that it grows by a fixed increment
 * that gets coarser as k gets larger, ending at 512K per step from 512K
 * upwards.
 */
int
step(int k)
{
	if (k < 1024) {
		return (k * 2);
	}
	if (k < 4 * 1024) {
		return (k + 1024);
	}
	if (k < 32 * 1024) {
		return (k + 2 * 1024);
	}
	if (k < 64 * 1024) {
		return (k + 4 * 1024);
	}
	if (k < 128 * 1024) {
		return (k + 8 * 1024);
	}
	if (k < 256 * 1024) {
		return (k + 16 * 1024);
	}
	if (k < 512 * 1024) {
		return (k + 32 * 1024);
	}
	return (k + 512 * 1024);
}

/*
 * getclock - run "mhz -c" and return the number it prints.
 *
 * main() prints the value as "clk" and loads() multiplies it into its
 * estimate of instruction time; presumably it is the CPU clock period in
 * nanoseconds (TODO confirm against the mhz tool).
 *
 * Returns 0 after a message on stderr when the command cannot be started or
 * prints no number.  With clk equal to 0, loads() deducts nothing for
 * instruction time.
 */
float
getclock(void)
{
	float	c = 0;
	FILE	*f = popen("mhz -c", "r");

	if (f == NULL) {
		fprintf(stderr, "getclock: cannot run \"mhz -c\"\n");
		return (0);
	}
	if (fscanf(f, "%f", &c) != 1) {
		fprintf(stderr, "getclock: no clock value from \"mhz -c\"\n");
		c = 0;
	}
	/* close the pipe and reap the child instead of leaking both */
	pclose(f);
	return (c);
}

/*
 * bytes - convert a size argument such as "512", "4k" or "2M" to bytes.
 *
 * The leading decimal number is read with atoi().  If the final character
 * of s is 'k' or 'K' the number is multiplied by 1024; if it is 'm' or 'M',
 * by 1024 * 1024.  Any other ending leaves the number as it is.  An empty
 * string yields 0.
 */
int
bytes(char *s)
{
	int	n = atoi(s);
	char	suffix = '\0';

	/* one pass to find the final character; stays '\0' for "" */
	while (*s != '\0') {
		suffix = *s++;
	}

	switch (suffix) {
	case 'k':
	case 'K':
		n *= 1024;
		break;
	case 'm':
	case 'M':
		n *= (1024 * 1024);
		break;
	default:
		break;
	}
	return (n);
}

/*
 * last - return the final character of the string s, before its
 * terminating NUL.  An empty string has no such character; '\0' is
 * returned for it rather than reading the byte in front of the string.
 */
int
last(char *s)
{
	if (*s == '\0') {
		return ('\0');
	}
	/* advance until s points at the character just before the NUL */
	while (s[1] != '\0') {
		s++;
	}
	return (*s);
}
