/*-
 * Copyright (c) 1994 Berkeley Software Design, Inc. All rights reserved.
 * The Berkeley Software Design Inc. software License Agreement specifies
 * the terms and conditions for redistribution.
 *
 *	BSDI $Id: vfs_cluster.c,v 2.2 1995/12/12 19:45:57 donn Exp $
 */

/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_cluster.c	8.7 (Berkeley) 2/13/94
 */

#include <sys/param.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
#include <sys/systm.h>

/*
 * doreallocblks gates the VOP_REALLOCBLKS path in cluster_write().
 * Its value is 1 when `notyet' is defined and 0 otherwise.  With DEBUG
 * it is a real variable referenced from the ctldebug entry debug13;
 * without DEBUG it is a compile-time constant.
 */
#ifdef DEBUG
#include <vm/vm.h>
#include <sys/sysctl.h>
#ifdef notyet
int doreallocblks = 1;
#else /* !notyet */
int doreallocblks = 0;
#endif /* notyet */
struct ctldebug debug13 = { "doreallocblks", &doreallocblks };
#else /* !DEBUG */
/* XXX for cluster_write */
#ifdef notyet
#define doreallocblks 1
#else /* !notyet */
#define doreallocblks 0
#endif /* notyet */
#endif /* DEBUG */

/*
 * Local declarations
 *
 * These helpers are defined later in this file.
 */
/* Chain up to `run' read requests behind a cluster head buffer. */
void	cluster_rbuild __P((struct vnode *, struct buf *, long, daddr_t,
		daddr_t, int, long));
/* Gather the delayed-write buffers of a cluster and write them out. */
void	cluster_wbuild __P((struct vnode *, struct buf *, long,
		daddr_t, int, daddr_t));
/* Collect the buffers of the current write cluster plus one extra buffer. */
struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));

/*
 * ISSEQREAD(vp, blk) is true when a read of logical block `blk' counts
 * as sequential for vnode `vp': the block is either the one last read
 * (vp->v_lastr) or the one immediately following it.
 *
 * Block zero is special.  In DIAGNOSTIC kernels the doclusterraz knob
 * decides how it is treated:
 *	1 - a read of block zero may trigger readahead;
 *	0 - a read of block zero is always treated as non-sequential.
 * Kernels built without DIAGNOSTIC always behave as if the knob were 0.
 *
 * Setting the knob to one assumes that most reads of block zero are the
 * start of a sequential pass over the file (e.g. cat, sum), so further
 * blocks will soon be wanted.  Setting it to zero assumes that most are
 * surgical strikes for particular info (e.g. size, file) where readahead
 * blocks would go unused and would push other potentially useful blocks
 * out of the cache.  The former seems intuitive, but some quick tests
 * showed that the latter performed better from a system-wide point of
 * view.
 */
#ifdef DIAGNOSTIC
int	doclusterraz = 0;
#define ISSEQREAD_BLK0_OK	doclusterraz
#else
#define ISSEQREAD_BLK0_OK	0
#endif

#define ISSEQREAD(vp, blk) \
	(((blk) != 0 || ISSEQREAD_BLK0_OK) && \
	 ((blk) == (vp)->v_lastr || (blk) == (vp)->v_lastr + 1))

/*
 * This replaces bread.  If this is a bread at the beginning of a file and
 * lastr is 0, we assume this is the first read and we'll read up to two
 * blocks if they are sequential.  After that, we'll do regular read ahead
 * in clustered chunks.
 *
 * There are 4 or 5 cases depending on how you count:
 *	Desired block is in the cache:
 *	    1 Not sequential access (0 I/Os).
 *	    2 Access is sequential, try to read ahead (1 ASYNC).
 *	Desired block is not in cache:
 *	    3 Not sequential access (1 SYNC).
 *	    4 Sequential access, next block is contiguous (1 SYNC).
 *	    5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
 *
 * There are potentially two buffers that require I/O.
 * If either is NULL, then you don't have to do the I/O.
 */
int
cluster_read(vp, filesize, lblkno, size, cred, bpp)
	struct vnode *vp;
	u_quad_t filesize;
	daddr_t lblkno;
	long size;
	struct ucred *cred;
	struct buf **bpp;
{
	register struct buf *bp, *syncbp, *asyncbp;
	struct buf *chain;
	struct rusage *ru;
	daddr_t blkno, ioblkno;
	long flags, iocount;
	int error, num_ra;

#ifdef DIAGNOSTIC
	if (size == 0)
		panic("cluster_read: size 0");
#endif

	error = 0;
	flags = B_READ;
	*bpp = bp = getblk(vp, lblkno, size, 0, 0);
	if (bp->b_flags & B_CACHE) {
		/*
		 * Desired block is in cache; do any readahead ASYNC.
		 * Case 1, 2.
		 */
		trace(TR_BREADHIT, pack(vp, size), lblkno);
		flags |= B_ASYNC;
		ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
		syncbp = NULL;		/* do not read it again */
	} else {
		/*
		 * Block wasn't in cache, case 3, 4, 5.  Defer trace
		 * call as size may be increased below (case 4).
		 */
		bp->b_flags |= B_READ;
		syncbp = bp;		/* still need to read it */
		ioblkno = lblkno;
	}
	asyncbp = NULL;			/* no async i/o to do yet */

	/*
	 * XXX
	 * Replace 1 with a window size based on some permutation of
	 * maxcontig and rot_delay.  This will let you figure out how
	 * many blocks you should read-ahead (case 2, 4, 5).
	 *
	 * If the access isn't sequential, reset the window to 1.
	 * Note that a read to the same block is considered sequential.
	 * This catches the case where the file is being read sequentially,
	 * but at smaller than the filesystem block size.
	 */
	if (!ISSEQREAD(vp, lblkno)) {
		vp->v_ralen = 0;
		vp->v_maxra = lblkno;
		goto skip_readahead;
	}
	/*
	 * Can only cluster full blocks.
	 * (XXX  ought to read-ahead last frag ?!?)
	 */
	if ((ioblkno + 1) * size > filesize)
		goto skip_readahead;
	if (syncbp == NULL) {
		/*
		 * Case 1, 2.  If clustering is working, the next block
		 * will usually already be in the cache; try to save work
		 * by avoiding the VOP_BMAP.  If not, chain cluster into
		 * a separate async read.
		 */
		bp = getblk(vp, ioblkno, size, 0, 0);
		if (bp->b_flags & B_CACHE) {
			/* readahead block in cache, just take it */
			brelse(bp);
			goto skip_readahead;
		}
		if ((error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) ||
		    blkno == -1) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto skip_readahead;
		}
		chain = asyncbp = bp;
	} else {
		/*
		 * Case 3, 4, 5.  Chain cluster into sync read if possible.
		 */
		if ((error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) ||
		    blkno == -1)
			goto skip_readahead;
		chain = syncbp;
	}

	/*
	 * Reading sequentially, and the next block is not in the
	 * cache.  We are going to try reading ahead.
	 *
	 * If there are no contiguous blocks here, we may still be
	 * able to win via case 5.  Save the result of the bmap so
	 * that we do not have to repeat it later, and do an async
	 * read of the next area.
	 */
	if (num_ra == 0 && ioblkno == lblkno) {
if (chain != syncbp) panic("cluster_read %x!=%x", chain, syncbp);
		syncbp->b_blkno = blkno;	/* nb, chain == syncbp */
		++ioblkno;
		if ((ioblkno + 1) * size > filesize)
			goto skip_readahead;
		bp = getblk(vp, ioblkno, size, 0, 0);
		if (bp->b_flags & B_CACHE) {
			/* readahead block in cache, just leave it */
			brelse(bp);
			goto skip_readahead;
		}
		if ((error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) ||
		    blkno == -1) {
			bp->b_flags |= B_INVAL;
			brelse(bp);
			goto skip_readahead;
		}
		chain = asyncbp = bp;
		flags |= B_ASYNC;
	}
	if (num_ra > 0) {
		/*
		 * If our desired readahead block had been read
		 * in a previous readahead but is no longer in
		 * core, then we may be reading ahead too far
		 * or are not using our readahead very rapidly.
		 * In this case we scale back the window.
		 *
		 * ### scale back arithmetically rather than geometrically?
		 */
		if (ioblkno <= vp->v_maxra)
			vp->v_ralen = max(vp->v_ralen >> 1, 1);
		/*
		 * There are more sequential blocks than our current
		 * window allows, scale up.  Ideally we want to get
		 * in sync with the filesystem maxcontig value.
		 */
		else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
			vp->v_ralen = vp->v_ralen ?
				min(num_ra, vp->v_ralen << 1) : 1;

		if (num_ra > vp->v_ralen)
			num_ra = vp->v_ralen;
	}

	/* ??? I am not sure this next test is needed  --chris */
	if ((ioblkno + num_ra + 1) * size > filesize)
		num_ra--;

	/* This is needed even if num_ra <= 0 (e.g., case 2 but num_ra==0). */
	cluster_rbuild(vp, chain, ioblkno, blkno, size, num_ra, flags);

	/* XXX Kirk, do we need to make sure the bp has creds? */
skip_readahead:
	iocount = 0;
	ru = &curproc->p_stats->p_ru;	/* XXX */
	if (syncbp) {			/* case 3, 5 */
		bp = syncbp;
		if (bp->b_flags & (B_DONE | B_DELWRI))
			panic("cluster_read: DONE syncbp");
		trace(TR_BREADMISS, pack(vp, bp->b_iocount), lblkno);
		ru->ru_inblock++;
#ifdef notyet
		if (bp->b_rcred == NOCRED && cred != NOCRED) {
			crhold(cred);
			bp->b_rcred = cred;
		}
#endif
		iocount = bp->b_iocount;
		error = VOP_STRATEGY(bp);
	}
	if (asyncbp) {			/* case 2, 5 */
		bp = asyncbp;
		if (error) {
			bp->b_flags &= ~(B_ASYNC | B_READ);
			brelse(bp);
		} else {
			trace(TR_BREADMISSRA, pack(vp, bp->b_iocount), ioblkno);
			ru->ru_inblock++;	/* XXX */
#ifdef notyet
			if (bp->b_rcred == NOCRED && cred != NOCRED) {
				crhold(cred);
				bp->b_rcred = cred;
			}
#endif
			iocount = bp->b_iocount;
			(void) VOP_STRATEGY(bp);
		}
	}

	/*
	 * Recalculate our maximum readahead
	 */
	if (iocount)
		vp->v_maxra = bp->b_lblkno + (iocount / size) - 1;

	if (syncbp)
		return (biowait(syncbp));
	return (error);
}

/*
 * Set up the head of a read cluster and chain further requests to it.
 *
 * chhead always gets b_blkno = blkno and has `flags' or'ed into its
 * b_flags, even when run <= 0.  Then, for up to `run' following blocks
 * (logical lbn+1, lbn+2, ... at physical blkno+btodb(size), ...), a
 * buffer is obtained and linked through b_chain, and chhead's b_iocount
 * grows by `size' for each one.  Chaining stops early at the first
 * block that is already in the cache.
 */
void
cluster_rbuild(vp, chhead, lbn, blkno, size, run, flags)
	struct vnode *vp;
	struct buf *chhead;
	daddr_t lbn;
	daddr_t blkno;
	long size;
	int run;
	long flags;
{
	register struct buf *nbp, **tailp;
	daddr_t pbn;
	int k, step;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_rbuild: size %ld != iosize %ld\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	/* Prepare the head of the chain. */
	chhead->b_blkno = blkno;
	chhead->b_flags |= flags;
	tailp = &chhead->b_chain;

	step = btodb(size);
	pbn = blkno;
	for (k = 1; k <= run; k++) {
		/* physical address of the k'th block after the head */
		pbn += step;

		nbp = getblk(vp, lbn + k, size, 0, 0);
		if (nbp->b_flags & B_CACHE) {
			/* Already cached: give it back and stop chaining. */
			brelse(nbp);
			break;
		}

		/* Tack it on the end and account for the extra i/o. */
		nbp->b_blkno = pbn;
		nbp->b_flags |= flags | B_READ | B_ASYNC;
		*tailp = nbp;
		tailp = &nbp->b_chain;
		chhead->b_iocount += size;
	}
	/* *tailp = NULL;	(already set) */
}

/*
 * Do clustered write for FFS.
 *
 * Three cases:
 *	1. Write is not sequential (write asynchronously)
 *	Write is sequential:
 *	2.	beginning of cluster - begin cluster
 *	3.	middle of a cluster - add to cluster
 *	4.	end of a cluster - asynchronously write cluster
 */
void
cluster_write(bp, filesize)
        struct buf *bp;
	u_quad_t filesize;
{
        struct vnode *vp;
        daddr_t lbn;
        int maxclen, cursize;

        vp = bp->b_vp;
        lbn = bp->b_lblkno;

	/* Initialize vnode to beginning of file. */
	if (lbn == 0)
		vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;

        if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
	    (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
		maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
		if (vp->v_clen != 0) {
			/*
			 * Next block is not sequential.
			 *
			 * If we are not writing at end of file, the process
			 * seeked to another point in the file since its
			 * last write, or we have reached our maximum
			 * cluster size, then push the previous cluster.
			 * Otherwise try reallocating to make it sequential.
			 */
			cursize = vp->v_lastw - vp->v_cstart + 1;
			if (!doreallocblks ||
			    (lbn + 1) * bp->b_bcount != filesize ||
			    lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
				cluster_wbuild(vp, NULL, bp->b_bcount,
				    vp->v_cstart, cursize, lbn);
			} else {
				struct buf **bpp, **endbp;
				struct cluster_save *buflist;

				buflist = cluster_collectbufs(vp, bp);
				endbp = &buflist->bs_children
				    [buflist->bs_nchildren - 1];
				if (VOP_REALLOCBLKS(vp, buflist)) {
					/*
					 * Failed, push the previous cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp < endbp; bpp++)
						brelse(*bpp);
					free(buflist, M_SEGMENT);
					cluster_wbuild(vp, NULL, bp->b_bcount,
					    vp->v_cstart, cursize, lbn);
				} else {
					/*
					 * Succeeded, keep building cluster.
					 */
					for (bpp = buflist->bs_children;
					     bpp <= endbp; bpp++)
						bdwrite(*bpp);
					free(buflist, M_SEGMENT);
					vp->v_lastw = lbn;
					vp->v_lasta = bp->b_blkno;
					return;
				}
			}
		}
		/*
		 * Consider beginning a cluster.
		 * If at end of file, make cluster as large as possible,
		 * otherwise find size of existing cluster.
		 */
		if ((lbn + 1) * bp->b_bcount != filesize &&
		    (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
		     bp->b_blkno == -1)) {
			bawrite(bp);
			vp->v_clen = 0;
			vp->v_lasta = bp->b_blkno;
			vp->v_cstart = lbn + 1;
			vp->v_lastw = lbn;
			return;
		}
                vp->v_clen = maxclen;
                if (maxclen == 0) {		/* I/O not contiguous */
			vp->v_cstart = lbn + 1;
                        bawrite(bp);
                } else {			/* Wait for rest of cluster */
			vp->v_cstart = lbn;
                        bdwrite(bp);
		}
	} else if (lbn == vp->v_cstart + vp->v_clen) {
		/*
		 * At end of cluster, write it out.
		 */
		cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
		    vp->v_clen + 1, lbn);
		vp->v_clen = 0;
		vp->v_cstart = lbn + 1;
	} else
		/*
		 * In the middle of a cluster, so just delay the
		 * I/O for now.
		 */
		bdwrite(bp);
	vp->v_lastw = lbn;
	vp->v_lasta = bp->b_blkno;
}


/*
 * Vaguely similar to cluster_rbuild...wish they could be combined.
 *
 * Walk the `len' logical blocks starting at start_lbn, linking the
 * delayed-write buffers found into chains through b_chain and starting
 * each chain with bawrite().  A block that cannot be used ends the
 * chain built so far; a later usable block starts a fresh one.
 *
 * lbn is the block on which the caller is currently doing I/O and which
 * the caller has locked.  If last_bp is not NULL it is the buffer for
 * lbn and takes part in the cluster; otherwise lbn is skipped when it
 * falls inside the range.
 */
void
cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
	struct vnode *vp;
	struct buf *last_bp;
	long size;
	daddr_t start_lbn;
	int len;
	daddr_t	lbn;
{
	struct buf *cur, *head, **tailp;
	struct rusage *usage;
	int ipl;

#ifdef DIAGNOSTIC
	if (size != vp->v_mount->mnt_stat.f_iosize)
		panic("cluster_wbuild: size %ld != iosize %ld\n",
			size, vp->v_mount->mnt_stat.f_iosize);
#endif
	usage = &curproc->p_stats->p_ru;	/* XXX */
	head = NULL;
	for (; len != 0; start_lbn++, len--) {
		/*
		 * Pick the buffer for this block.  The caller's own block
		 * must not go through getblk (the caller holds it locked),
		 * so last_bp is used for it; a null last_bp means the
		 * caller will write that block itself.
		 */
		if (start_lbn == lbn) {
			cur = last_bp;
		} else {
			cur = getblk(vp, start_lbn, 0, 0, 0);
			if (cur != NULL && (cur->b_flags & B_DELWRI) == 0) {
				/* Already written out: not part of a chain. */
				brelse(cur);
				cur = NULL;
			}
		}

		if (cur == NULL) {
			/*
			 * Nothing usable for this block.  Push out any
			 * cluster accumulated so far and move on.
			 */
			if (head != NULL) {
				bawrite(head);
				head = NULL;
			}
			continue;
		}

		if (head == NULL) {
			/*
			 * Start new chain.  A call to bawrite() will adjust
			 * this buffer's vnode's lists later.
			 */
			head = cur;
			tailp = &cur->b_chain;
			continue;
		}

		/*
		 * Append this block to the chain.  Duplicate here
		 * the work that would have been done in bawrite().
		 */
		ipl = splbio();
		cur->b_flags =
		    (cur->b_flags & ~(B_READ|B_DONE|B_ERROR|B_DELWRI)) |
		    (B_ASYNC | B_AGE);
		reassignbuf(cur, vp);	/* move to clean list */
		vp->v_numoutput++;
		splx(ipl);
		*tailp = cur;
		tailp = &cur->b_chain;
		head->b_iocount += size;
		usage->ru_oublock--;	/* overpaid; get a refund */
	}
	/* Start whatever chain is still pending. */
	if (head != NULL)
		bawrite(head);
}

/*
 * Build a cluster_save list holding the buffers of the current write
 * cluster (logical blocks v_cstart through v_lastw, obtained with
 * bread) followed by one additional buffer, last_bp.
 *
 * The list is allocated from M_SEGMENT with M_WAITOK; bs_nchildren is
 * set to the number of entries stored, last_bp included.
 */
struct cluster_save *
cluster_collectbufs(vp, last_bp)
	struct vnode *vp;
	struct buf *last_bp;
{
	struct cluster_save *save;
	daddr_t	blk;
	int idx, nbufs;

	nbufs = vp->v_lastw - vp->v_cstart + 1;
	/*
	 * Room for nbufs pointers on top of the structure itself.
	 * NOTE(review): presumably struct cluster_save already has space
	 * for one bs_children entry (for last_bp) -- confirm.
	 */
	save = malloc(sizeof(*save) + (nbufs * sizeof(struct buf *)),
	    M_SEGMENT, M_WAITOK);
	save->bs_nchildren = 0;

	/* Fetch each block of the cluster; bread's result is ignored. */
	idx = 0;
	blk = vp->v_cstart;
	while (idx < nbufs) {
		(void)bread(vp, blk, last_bp->b_bcount, NOCRED,
		    &save->bs_children[idx]);
		blk++;
		idx++;
	}

	/* The extra buffer goes last. */
	save->bs_children[idx] = last_bp;
	save->bs_nchildren = idx + 1;
	return (save);
}
