/*-
 * Copyright (c) 1994, 1995 Berkeley Software Design, Inc. All rights reserved.
 * The Berkeley Software Design Inc. software License Agreement specifies
 * the terms and conditions for redistribution.
 *
 *	BSDI $Id: vm_swap.c,v 2.2 1995/09/19 01:36:36 karels Exp $
 */

/*
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vm_swap.c	8.5 (Berkeley) 2/17/94
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/dmap.h>		/* XXX */
#include <sys/vnode.h>
#include <sys/map.h>
#include <sys/malloc.h>
#include <sys/file.h>

#include <vm/swap_pager.h>

#include <miscfs/specfs/specdev.h>

/* Forward declarations for routines defined later in this file. */
void	swstrategy __P((struct buf *));
int	swfree __P((struct proc *, struct swdevt *, int));

/*
 * Device switch entry for the swap pseudo-device.  Of the handlers
 * listed, only swstrategy is defined in this file; the others are
 * shared routines defined elsewhere (judging by their names, generic
 * or "not supported" handlers -- confirm against the declaration of
 * struct devsw, which also fixes the meaning of each slot).
 */
struct devsw swapsw = {
	NULL,
	nullopen, nullclose, rawread, rawwrite, noioctl, noselect, nommap,
	swstrategy, nodump, nopsize, 0,
	nostop
};

/*
 * Indirect driver for multi-device paging.
 */

struct	swdevt *swlist;		/* list of all swap devices */
struct	swdevt **swnext;	/* where to add next seq device */
struct	swdevt *swseq;		/* first sequential device */
int	niswdev;		/* number of interleaved swap devices */
int	niswap;			/* size of interleaved swap area */
struct	swapstats swapstats;	/* swap accounting, updated by swapinit/swapon/swfree */
/*
 * Number of device slots the interleaved area is laid out over: stays 1
 * with at most one interleaved device, otherwise swapinit() sets it to
 * 2 * niswdev (alternating devices and holes).
 */
int	niswdev2 = 1;
/*
 * Upper bound applied by swfree() to swapstats.swap_reserve:
 * 1024 * 1024 divided into DEV_BSIZE-sized units.
 */
int	swap_reservemax = 1024 * 1024 / DEV_BSIZE;

/*
 * Configure swap space and related parameters.
 * Initialize linked list of free swap
 * headers. These do not actually point
 * to buffers, but rather to pages that
 * are being swapped in and out.
 */
void
swapinit()
{
	register struct swdevt *swp;
	register int nblks;
	register int i;
	register struct buf *sp = swbuf;
	register struct proc *p = &proc0;	/* XXX */
	int error;

	/*
	 * Count pre-configured interleaved swap devices,
	 * and adjust total swap space available.
	 * Find size of each configured swap device.
	 * Some of the space will not be countable until later (dynamically
	 * configurable devices) and some of the counted space will not be
	 * available until a swapon() system call is issued, both usually
	 * happen when the system goes multi-user.
	 * Also, if we have more than one interleaved device, we currently
	 * arrange things as if there were twice as many interleaved devices,
	 * with alternating devices and holes; see the comment before swfree().
	 *
	 * If using NFS for swap, swdevt[0] will already be bdevvp'd.	XXX
	 */

	swnext = &swlist;
	for (swp = swdevt; swp->sw_dev != NODEV || swp->sw_vp != NULL; swp++) {
		*swnext = swp;
		swnext = &swp->sw_next;
		if ((u_int)major(swp->sw_dev) >= ndevsw)
			break;
		if (devsw[major(swp->sw_dev)]->d_psize) {
			nblks =
			  (*devsw[major(swp->sw_dev)]->d_psize)(swp->sw_dev);
			if (nblks != -1 &&
			    (swp->sw_nblks == 0 || swp->sw_nblks > nblks))
				/* Force page alignment */
				swp->sw_nblks = ctod(dtoc(nblks));
		}
		niswdev++;
		if (swp->sw_nblks > niswap)
			niswap = swp->sw_nblks;
	}
	if (niswdev > 1) {
		niswap = roundup(niswap, dmmax);
		niswdev2 = niswdev * 2;
		niswap *= niswdev2;
	}
	if (swdevt[0].sw_vp == NULL &&
	    bdevvp(swdevt[0].sw_dev, &swdevt[0].sw_vp))
		panic("swapvp");
	swapstats.swap_nswdev = niswdev;
	swapstats.swap_max = niswap;

	/*
	 * Initialize the swapmap.
	 */
	rminit(swapmap, (long) 0, (long) 0, "swap", nswapmap);
	swapstats.swap_mapsize = nswapmap;

#if 0
	/*
	 * We do not warn here because this happens during an initial
	 * installation from floppy, and don't want to concern the
	 * user unduly.
	 */
	if (swapstats.swap_max == 0)
		printf("WARNING: no swap space found\n");
	else
#else
	if (swapstats.swap_max)
#endif
	if (error = swfree(p, &swdevt[0], 0))
		printf("Warning, no swap space (swfree errno %d)\n", error);

	/*
	 * Now set up swap buffer headers.
	 */
	bswlist.b_actf = sp;
	for (i = 0; i < nswbuf - 1; i++, sp++) {
		sp->b_actf = sp + 1;
		sp->b_rcred = sp->b_wcred = p->p_ucred;
		sp->b_vnbufs.le_next = NOLIST;
	}
	sp->b_rcred = sp->b_wcred = p->p_ucred;
	sp->b_vnbufs.le_next = NOLIST;
	sp->b_actf = NULL;
}

void
swstrategy(bp)
	register struct buf *bp;
{
	int sz, off, seg;
	register struct swdevt *sp;
	struct vnode *vp;

#ifdef GENERIC
	/*
	 * A mini-root gets copied into the front of the swap
	 * and we run over top of the swap area just long
	 * enough for us to do a mkfs and restor of the real
	 * root (sure beats rewriting standalone restor).
	 */
#define	MINIROOTSIZE	4096
	if (rootdev == dumpdev)
		bp->b_blkno += MINIROOTSIZE;
#endif
	sz = howmany(bp->b_iocount, DEV_BSIZE);
	if (bp->b_blkno + sz > swapstats.swap_max) {
		bp->b_error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		return;
	}
	if (swapstats.swap_nswdev > 1) {
		if (bp->b_blkno < niswap) {
			if (niswdev > 1) {
				off = bp->b_blkno % dmmax;
				if (off + sz > dmmax) {
					bp->b_error = EINVAL;
					bp->b_flags |= B_ERROR;
					biodone(bp);
					return;
				}
				seg = bp->b_blkno / dmmax;
#ifdef DEBUG
				if (seg & 1)
					panic("swstrategy: block between devs");
#endif
				sp = &swdevt[(seg % niswdev2) / 2];
				seg /= niswdev2;
				bp->b_blkno = seg*dmmax + off;
			} else
				sp = swdevt;
		} else {
			bp->b_blkno -= niswap;
			for (sp = swseq; sp; sp = sp->sw_next) {
				if (bp->b_blkno < sp->sw_nblks)
					break;
				bp->b_blkno -= sp->sw_nblks;
			}
			if (sp == NULL || bp->b_blkno + sz > sp->sw_nblks) {
				bp->b_error = sp == NULL ?  ENODEV : EINVAL;
				bp->b_flags |= B_ERROR;
				biodone(bp);
				return;
			}
		}
	} else
		sp = swdevt;
	if ((bp->b_dev = sp->sw_dev) == NODEV)
		panic("swstrategy");
	if (sp->sw_vp == NULL) {
		bp->b_error = ENODEV;
		bp->b_flags |= B_ERROR;
		biodone(bp);
		return;
	}
	VHOLD(sp->sw_vp);
	if ((bp->b_flags & B_READ) == 0) {
		if (vp = bp->b_vp) {
			vp->v_numoutput--;
			if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
				vp->v_flag &= ~VBWAIT;
				wakeup((caddr_t)&vp->v_numoutput);
			}
		}
		sp->sw_vp->v_numoutput++;
	}
	if (bp->b_vp != NULL)
		brelvp(bp);
	bp->b_vp = sp->sw_vp;
	VOP_STRATEGY(bp);
}

/*
 * System call swapon(name) enables swapping on device name.
 *
 * The caller must pass the suser() check.  name must resolve to a block
 * special file (ENOTBLK otherwise) whose major number is within the
 * device switch and has a strategy routine (ENXIO otherwise).
 *
 * A device matching an entry already on the swap device list is enabled
 * in place; EBUSY is returned if that entry is already marked SW_FREED.
 * Any other device is appended to the list as a new sequential device.
 *
 * On success the vnode reference obtained from namei() is kept in the
 * entry's sw_vp.  On failure it is released, and an existing entry gets
 * its previous sw_vp back.
 */
struct swapon_args {
	char	*name;
};
/* ARGSUSED */
int
swapon(p, uap, retval)
	struct proc *p;
	struct swapon_args *uap;
	int *retval;
{
	register struct vnode *vp;
	register struct swdevt *sp;
	struct vnode *oldvp;
	dev_t dev;
	int error;
	struct nameidata nd;

	if (error = suser(p->p_ucred, &p->p_acflag))
		return (error);
	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->name, p);
	if (error = namei(&nd))
		return (error);
	vp = nd.ni_vp;
	if (vp->v_type != VBLK) {
		vrele(vp);
		return (ENOTBLK);
	}
	dev = (dev_t)vp->v_rdev;
	if (major(dev) >= ndevsw || devsw[major(dev)]->d_strategy == NULL) {
		vrele(vp);
		return (ENXIO);
	}
	for (sp = swdevt; sp; sp = sp->sw_next) {
		if (sp->sw_dev == dev) {
			if (sp->sw_flags & SW_FREED) {
				vrele(vp);
				return (EBUSY);
			}
			oldvp = sp->sw_vp;
			sp->sw_vp = vp;
			/*
			 * Assume that all sequential devices are freed when
			 * added to list; this must be an interleaved device.
			 */
			if (error = swfree(p, sp, sp - swdevt)) {
				/*
				 * vp is released just below, so it must not
				 * stay behind in the entry: put back whatever
				 * the entry held before this call.
				 */
				sp->sw_vp = oldvp;
				vrele(vp);
				return (error);
			}
			/*
			 * NOTE(review): a non-NULL oldvp (e.g. the vnode set
			 * up by swapinit() for swdevt[0]) is simply replaced
			 * here and its reference is not dropped; confirm
			 * whether anything else still uses it before
			 * releasing it.
			 */
			return (0);
		}
	}
	/* Not a configured device: set it up as a new sequential one. */
	sp = (struct swdevt *) malloc(sizeof(struct swdevt),
	    M_VMPGDATA, M_WAITOK);
	bzero(sp, sizeof(*sp));
	sp->sw_dev = dev;
	sp->sw_flags = SW_SEQUENTIAL;
	sp->sw_vp = vp;
	if (error = swfree(p, sp, -1)) {
		free(sp, M_VMPGDATA);
		vrele(vp);
		return (error);
	}
	/* Link it in only after swfree() succeeded. */
	if (swseq == NULL)
		swseq = sp;
	*swnext = sp;
	swnext = &sp->sw_next;
	++swapstats.swap_nswdev;
	return (0);
}

/*
 * Swfree(p, sp, index) frees the specified portion of the swap map,
 * associated with a specific device.  Portions from 0 through niswdev
 * provide 1/niswdev'th of the interleaved swap space, which is laid out
 * with blocks of dmmax pages circularly among the devices.  If there are
 * any sequential devices, they follow the interleaved range.  Index is
 * valid only for non-sequential devices.
 *
 * Currently, if there is more than one interleaved device, we lay out
 * the interleaved portion of the space as if there were 2 * niswdev
 * (or swapstats.swap_nswdev) devices, with the odd devices missing,
 * to prevent allocations from spanning devices.
 */
int
swfree(p, sp, index)
	struct proc *p;
	register struct swdevt *sp;
	int index;
{
	register swblk_t vsbase;
	register long blk;
	struct vnode *vp;
	register swblk_t dvbase;
	register int nblks;
	int error;

	vp = sp->sw_vp;
	if (error = VOP_OPEN(vp, FREAD|FWRITE, p->p_ucred, p))
		return (error);
	nblks = sp->sw_nblks;
	/*
	 * Some devices may not exist til after boot time.
	 * If so, their nblk count will be 0.
	 */
	if (nblks <= 0) {
		int perdev;
		dev_t dev = sp->sw_dev;

		if (devsw[major(dev)]->d_psize == 0 ||
		    (nblks = (*devsw[major(dev)]->d_psize)(dev)) == -1) {
			(void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
			return (ENXIO);
		}
		if ((sp->sw_flags & SW_SEQUENTIAL) == 0) {
			perdev = niswap / niswdev2;
			if (nblks > perdev)
				nblks = perdev;
		} else {
#if 0 /* we do not need to align to a dmmax boundary, although 4.4BSD does */
			if (nblks % dmmax)
				nblks -= (nblks % dmmax);
#endif
			swapstats.swap_max += nblks;
		}
		sp->sw_nblks = nblks;
	}
	if (nblks == 0) {
		(void) VOP_CLOSE(vp, FREAD|FWRITE, p->p_ucred, p);
		return (0);	/* XXX error? */
	}
	swapstats.swap_devs++;
	sp->sw_flags |= SW_FREED;
	if (sp->sw_flags & SW_SEQUENTIAL) {
		register struct swdevt *swp;

		blk = niswap;
		for (swp = swseq; swp; swp = swp->sw_next)
			blk += swp->sw_nblks;
		/*
		 * Don't use the first cluster of the device
		 * in case it starts with a label or boot block.
		 * This also prevents allocations from spanning devices.
		 */
		nblks -= ctod(CLSIZE);
		rmfree(swapmap, nblks, blk + ctod(CLSIZE));
		swapstats.swap_total += nblks;
		swapstats.swap_free += nblks;
	} else
	    for (dvbase = 0; dvbase < nblks; dvbase += dmmax) {
		blk = nblks - dvbase;
		if ((vsbase = 2 * index * dmmax + dvbase * niswdev2) >= niswap)
			panic("swfree");
		if (blk > dmmax)
			blk = dmmax;
		if (dvbase == 0) {
			/*
			 * Don't use the first cluster of the device
			 * in case it starts with a label or boot block.
			 */
			blk -= ctod(CLSIZE);
			vsbase += ctod(CLSIZE);
		}
		rmfree(swapmap, blk, vsbase);
		swapstats.swap_total += blk;
		swapstats.swap_free += blk;
	}
	/*
	 * Set reserve to smaller of swap_reservemax (initialized above)
	 * and 10% of new total swap space.  If free space gets down
	 * to the reserve, we stop promising additional virtual memory.
	 */
	swapstats.swap_reserve = min(swapstats.swap_total/10, swap_reservemax);
	return (0);
}
