/*
 *   iswraid.c Copyright (C) 2003,2004 Intel Corporation. 
 *   All rights reserved.
 *
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation; either version 2, or (at your option)
 *   any later version.
 *
 *   You should have received a copy of the GNU General Public License
 *   (for example /usr/src/linux/COPYING); if not, write to the Free
 *   Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 *   Authors: Boji Tony Kannanthanam 
 *            < boji dot t dot kannanthanam at intel dot com >
 *            Martins Krikis
 *            < martins dot krikis at intel dot com >
 *
 *   Based on ataraid codebase by Arjan van de Ven
 */

/* "iswraid" is an ataraid subdriver for Intel's ICH5R and ICH6R chipsets.
 * The "ataraid" module needs to be loaded before this driver can load. 
 * This subdriver differs from the other "ataraid" subdrivers in that it probes
 * SCSI disks looking for RAID member disks instead of the ATA/IDE subsystem.
 * Therefore, the "ata_piix" driver which detects the SATA drives connected
 * to ICH5R and ICH6R and presents them as SCSI devices is also needed.
 * Adding "alias scsi_hostadapter ata_piix" to modules.conf will ensure that
 * "ata_piix" gets loaded on demand when this driver starts.
 */
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/smp_lock.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/genhd.h>
#include <linux/ioctl.h>
#include <linux/list.h>
#include <linux/ide.h>
#include <linux/proc_fs.h>
#include <asm/uaccess.h>
#include "../../scsi/scsi.h"
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
#include "ataraid.h"
#include "iswraid.h"

/* Change the "#if 0" below to "#if 1" to define DRIVERDEBUG, which turns
 * every DEBUG() call in this file into a printk().
 */
#if 0
#define DRIVERDEBUG
#endif

/* DEBUG(fmt, args...) forwards to printk() when DRIVERDEBUG is defined and
 * expands to nothing otherwise.
 */
#ifdef DRIVERDEBUG
#define DEBUG(s, args...) printk(s, ## args)
#else
#define DEBUG(s, args...)
#endif

#define ISW_VERSION_STRING        "Version 0.1.3"

/* Compile-time defaults for the three integer module parameters below. */
#define ISWRAID_CLAIM_DISKS   1 /* Claim disks with MPBs for RAID by default */
#define ISWRAID_HALT_DEGRADED 0 /* Don't halt IOs if RAID1 becomes degraded */
#define ISWRAID_ERROR_THRESHOLD 10 /* Threshold for marking a disk degraded */
static int iswraid_claim_disks = ISWRAID_CLAIM_DISKS;
static int iswraid_halt_degraded = ISWRAID_HALT_DEGRADED;
static int iswraid_error_threshold = ISWRAID_ERROR_THRESHOLD;
/* Each tunable is exported as an "i" (int) module parameter. */
MODULE_PARM(iswraid_claim_disks, "i");
MODULE_PARM_DESC(iswraid_claim_disks,
		 "Claim disks with ISWRAID MPBs for RAID, 0 to disable");
MODULE_PARM(iswraid_halt_degraded, "i");
MODULE_PARM_DESC(iswraid_halt_degraded,
		 "Halt IOs if RAID1 becomes degraded, 1 to enable");
MODULE_PARM(iswraid_error_threshold, "i");
MODULE_PARM_DESC(iswraid_error_threshold,
		 "Error threshold for marking disks degraded, 0 to disable");

/* Prototypes for the handlers referenced by the raid_device_operations
 * tables (iswraid0_ops, iswraid1_ops) before their definitions appear.
 */
static int iswraid_open(struct inode *inode, struct file *filp);
static int iswraid_release(struct inode *inode, struct file *filp);
static int iswraid_ioctl(struct inode *inode, struct file *file,
			 unsigned int cmd, unsigned long arg);
static int iswraid0_make_request(request_queue_t *q, int rw,
				 struct buffer_head *bh);
static int iswraid1_make_request(request_queue_t *q, int rw,
				 struct buffer_head *bh);

/* Capacities of the statically sized tables used throughout this file. */
#define MAX_RAID_ARRAYS 8
#define MAX_RAID_VOLUMES 16
#define MAX_RAID_MEMBER_DISKS 8
/* ataraid.c uses bits in variable ataraiduse to keep track of raid devices */
/* i.e. one slot per bit of an unsigned int */
#define MAX_ATARAID_RAIDDEVS (sizeof(unsigned int) * 8) 

/* Forward declarations: struct array holds pointers to both. */
struct disk;
struct volume;

/* In-memory description of one array: its MPB plus fixed-size tables of
 * pointers to the member disks and to the volumes defined on them.
 * update_mpb() walks disks[] in step with mpb->diskTbl[] (numDisks entries)
 * and volumes[] in step with the raid devices in the MPB (numRaidDevs).
 */
struct array {
	/* NOTE(review): not referenced in the visible part of this file;
	 * presumably marks an array that must not be used -- confirm.
	 */
	int disabled;
	struct _RaidMpb *mpb; /* the most up-to-date MPB among member disks */
	struct disk *disks[MAX_RAID_MEMBER_DISKS];
	struct volume *volumes[MAX_RAID_VOLUMES];
};

/* Static table of array descriptors; arraycount presumably tracks how many
 * entries are in use (maintained outside the visible part of the file).
 */
static struct array arrays[MAX_RAID_ARRAYS];
static int arraycount = 0;

/* Per-disk state. Instances are linked into the global disklist through
 * the "head" member and looked up by device number or serial number.
 */
struct disk {
	struct list_head head;   /* link in disklist */
	kdev_t dev;              /* device number, matched against b_rdev */
	struct block_device *bdev;
	atomic_t errorcount;     /* bumped by update_error_counts() */
	unsigned int status; /* used with atomic bit operations */
	spinlock_t lock; /* to protect last_pos, and for future enhancements */
	unsigned long last_pos;  /* last sector of the most recent IO sent */
	make_request_fn *old_makereqfn; /* called to queue authorized IO */
	struct array *array;     /* array this disk belongs to */
	struct _RaidMpb *mpb;    /* NOTE(review): presumably this disk's own
				  * copy of the MPB -- confirm */
	unsigned char serial[MAX_RAID_SERIAL_LEN + 1];
};

/* Global list of struct disk entries (linked via disk->head). */
static LIST_HEAD(disklist);
static int diskcount = 0;
/* Used as the quick-queueing function for IOs that are split across several
 * disks (stored into bh_private.old_makereqfn); when it is NULL,
 * scsidisk_make_request() looks the function up per device instead.
 */
static make_request_fn *oneandonly_makereqfn = NULL; /* NULL if not valid */

/* Values update_mpb() writes into a volume map's mapState field. */
#define DEGRADED_MAP 2
#define FAILED_MAP   3

/* Disk status flag masks. */
#define SPARE_DISK      0x01  /* Spare */
#define CONFIGURED_DISK 0x02  /* Member of some RaidDev */
#define FAILED_DISK     0x04  /* Permanent failure */
#define USABLE_DISK     0x08  /* Fully usable unless FAILED_DISK is set */

/* Bit numbers (not masks) for use with test_bit()/set_bit(). FAILED_BIT is
 * applied both to disk->status and to volume->state in this file.
 */
#define DISABLED_BIT 0
#define DEGRADED_BIT 1
#define FAILED_BIT   2   /* should match FAILED_DISK's bit, above */

/* Per-volume state. The make_request routines find a volume through the
 * raid[] table, indexed by (minor >> SHIFT).
 */
struct volume {
	int devbit;             /* printed as "volume devbit" in error logs */
	int refcnt;
	int raidlevel;          /* 0 = striping, 1 = mirroring */
	unsigned int state; /* used with atomic bit operations */
	unsigned int pbaOfLba0; /* added to volume sectors to get disk sectors */
	unsigned int blocksPerStrip;	
	unsigned long sectors;  /* size of the whole volume in blocks */
	int tiebreak;           /* helps choose a disk when there is a tie */
	struct array *array;    /* owning array */
	int numdisks;           /* number of valid entries in disks[] */
	unsigned int degradedbits; /* bit i set: member disk i is not used */
	struct disk *disks[MAX_RAID_MEMBER_DISKS];
	struct geom geom;
	unsigned char serial[MAX_RAID_SERIAL_LEN + 1];
};

/* Static table of volume descriptors; volumecount presumably tracks how
 * many entries are in use (maintained outside the visible part of the file).
 */
static struct volume volumes[MAX_RAID_VOLUMES];
static int volumecount = 0;

/* Tracking structure attached (via b_private) to every buffer_head this
 * driver submits; end_io() uses it to decide when the overall IO is done
 * and how to report it.
 */
struct bh_private {
	struct buffer_head *parent; /* original bh when the IO was cloned or
				     * split; NULL when bh was remapped in
				     * place */
	struct volume *volume;      /* NULL for non-volume (MPB) IO */
	atomic_t count;             /* component IOs still outstanding */
	int rw;
	unsigned long status; /* used with atomic bit ops; disk status bits */
	make_request_fn *old_makereqfn; /* quick path for queueing, or NULL */
	/* saved completion routine and b_private, restored by end_io()
	 * when parent is NULL
	 */
	void (*old_endiofn)(struct buffer_head *bh, int uptodate);
	void *old_private;
};

/* Operations table for RAID0 volumes: the shared open/release/ioctl
 * handlers plus the striping make_request routine.
 */
static struct raid_device_operations iswraid0_ops = {
	.open         = iswraid_open,
	.release      = iswraid_release,
	.ioctl        = iswraid_ioctl,
	.make_request = iswraid0_make_request,
};

/* Operations table for RAID1 volumes: the shared open/release/ioctl
 * handlers plus the mirroring make_request routine.
 */
static struct raid_device_operations iswraid1_ops = {
	.open         = iswraid_open,
	.release      = iswraid_release,
	.ioctl        = iswraid_ioctl,
	.make_request = iswraid1_make_request,
};

/* Maps an ataraid device slot (minor >> SHIFT) to its volume. */
static struct volume *raid[MAX_ATARAID_RAIDDEVS];

/* NOTE(review): not used in the visible part of the file; presumably
 * serializes open/release/ioctl -- confirm.
 */
static DECLARE_MUTEX(iswraid_sem);

/* Look up a disk by device number.
 * Walks the global disklist and returns the first entry whose dev field
 * equals the argument, or NULL when no entry matches.
 */
static struct disk *
find_disk_by_dev(kdev_t dev)
{
	struct disk *found = NULL;
	struct list_head *pos;

	list_for_each(pos, &disklist) {
		struct disk *candidate = list_entry(pos, struct disk, head);

		if (candidate->dev == dev) {
			found = candidate;
			break;
		}
	}
	return found;
}

/* Find a disk given its serial number */
static struct disk *
find_disk_by_serial(unsigned char *serial)
{
	struct list_head *curr;
	list_for_each(curr, &disklist) {
		struct disk *disk = list_entry(curr, struct disk, head);
		/* could consider partial matches here due to placeholders */
		if (!strncmp(disk->serial, serial, MAX_RAID_SERIAL_LEN))
			return disk;
	}
	return NULL;
}
		
/* Allocate a bh tracking structure (approach borrowed from ataraid.c).
 * Never returns NULL: a failed GFP_NOIO allocation is retried after
 * yielding the CPU. Only the status field is initialized (to 0); callers
 * fill in everything else.
 */
/* FIXME should use slab caches, not kmalloc */
static struct bh_private *
get_private(void)
{
	struct bh_private *priv;

	for (;;) {
		priv = kmalloc(sizeof(struct bh_private), GFP_NOIO);
		if (priv)
			break;
		__set_current_state(TASK_RUNNING);
		yield();
	}
	priv->status = 0;
	return priv;
}

/* Figure out where the MPB data "starts" */
static unsigned long __init
calc_mpb_blocknum(int major, int minor)
{
	struct gendisk *gdisk = get_gendisk(MKDEV(major, minor));
	if (!gdisk) {
		DEBUG("iswraid: can't get partitioning info for major %d\n",
		      major);
		return 0;
	}
	/* gendisk->sizes are in 1024 sized blocks. We use 512. Hence * 2 */
	/* Our MPB's "first" block is 1024 bytes from the end of the disk */
	return (((unsigned long) gdisk->sizes[minor] - 1)
		* (1024 / ISW_DISK_BLOCK_SIZE));
}

/* Compute the checksum of RAID metadata.
 * Adds up mpbSize / sizeof(u32) 32-bit words starting at buffer. The
 * stored checkSum field is excluded from the result by seeding the sum
 * with its negation, so it cancels out when that word is added.
 */
static u32
compute_checksum(const u32 *buffer, u32 mpbSize)
{
	const u32 *word = buffer;
	const u32 *end = buffer + (mpbSize / sizeof (u32));
	u32 sum = -((struct _RaidMpb *) buffer)->checkSum;

	while (word < end)
		sum += *word++;
	return sum;
}

/* start_mpb_write() installs end_io as a completion routine before the
 * definition of end_io appears in this file.
 */
static void end_io(struct buffer_head *bh, int uptodate); /* forward decl. */

/* Final completion routine for an MPB write (installed by start_mpb_write
 * as old_endiofn, with the disk as old_private). Logs the outcome and
 * releases the buffer_head it is handed.
 */
static void
end_mpb_write(struct buffer_head *bh, int uptodate)
{
	struct disk *target = bh->b_private;
	const char *outcome;

	if (uptodate)
		outcome = "completed successfully";
	else
		outcome = "failed";
	printk(KERN_INFO "iswraid: MPB write to disk major %d minor %d %s\n",
	       MAJOR(target->dev), MINOR(target->dev), outcome);
	/* here we free the last one, end_io frees the first one */
	kfree(bh);
}

/* Starts an MPB write to a particular disk. May not sleep (all allocations
 * use GFP_ATOMIC).
 *
 * The first ISW_DISK_BLOCK_SIZE bytes of mpb go to the block returned by
 * calc_mpb_blocknum(); if the MPB is larger than one block, the remainder
 * goes to the blocks immediately preceding it. One buffer_head is used for
 * each of the two pieces; both share one bh_private whose count is the
 * number of pieces, so end_io() calls end_mpb_write() once, when the last
 * piece completes.
 *
 * Returns 0 once the writes have been submitted, -EINVAL if the MPB
 * location cannot be determined, or -ENOMEM if an allocation fails (nothing
 * has been submitted in that case).
 */
static int
start_mpb_write(struct disk *disk, struct _RaidMpb *mpb)
{
	kdev_t dev = disk->dev;
	unsigned long mpb_blocknum;
	int mpbblocks;
	struct buffer_head *bh1, *bh2;
	struct bh_private *private;
	DEBUG("iswraid: start_mpb_write\n");
	DEBUG("iswraid: Size of RAID metadata is %d bytes\n", mpb->mpbSize);

	/* Find the block number of the "first" block of Intel RAID metadata */
	if (!(mpb_blocknum = calc_mpb_blocknum(MAJOR(dev), MINOR(dev))))
		return -EINVAL;

	/* if this is not nice, we can keep 2 static bufferheads per disk... */
	if (!(bh1 = kmalloc(sizeof(struct buffer_head), GFP_ATOMIC)))
		return -ENOMEM;

	memset(bh1, 0, sizeof(*bh1)); /* most fields will be NULL */
	/* FIXME need a valid b_page for CONFIG_HIGHMEM systems */
	bh1->b_rsector = mpb_blocknum;
	bh1->b_rdev = dev;
	bh1->b_size = ISW_DISK_BLOCK_SIZE;
	bh1->b_data = (char *) mpb;
	bh1->b_end_io = &end_io; /* our usual completion routine */
	set_bit(BH_Mapped, &bh1->b_state); /* checked in __make_request */
	set_bit(BH_Sync, &bh1->b_state);
	set_bit(BH_Lock, &bh1->b_state);

	/* again, keeping a static bh_private for each disk would do, too */
	if (!(private = kmalloc(sizeof(struct bh_private), GFP_ATOMIC))) {
		kfree(bh1);
		return -ENOMEM;
	}

	/* volume == NULL marks this as non-volume IO for the error handlers;
	 * parent == NULL makes end_io() finish through old_endiofn.
	 */
	bh1->b_private = private;
	private->parent = NULL;
	private->volume = NULL;
	private->rw = WRITE;    /* irrelevant */
	private->status = 0;
	private->old_makereqfn = disk->old_makereqfn;
	private->old_endiofn = &end_mpb_write;
	private->old_private = disk;
	
	/* number of disk blocks the MPB occupies, rounded up */
	mpbblocks = ((mpb->mpbSize + ISW_DISK_BLOCK_SIZE - 1)
		     / ISW_DISK_BLOCK_SIZE);
	if (mpbblocks <= 1) /* only == 1 possible, really */
		atomic_set(&private->count, 1);
	else {
		/* count must be final before anything is submitted */
		atomic_set(&private->count, 2);
		if (!(bh2 = kmalloc(sizeof(struct buffer_head), GFP_ATOMIC))) {
			kfree(bh1);
			kfree(private);
			return -ENOMEM;
		}
		/* copied after bh1 is fully set up, so bh2 inherits b_private,
		 * b_end_io and the state bits
		 */
		memcpy(bh2, bh1, sizeof(*bh1)); /* most fields like in bh1 */
		bh2->b_rsector = mpb_blocknum - (mpbblocks - 1);
		bh2->b_size = ISW_DISK_BLOCK_SIZE * (mpbblocks - 1);
		bh2->b_data = (char *) mpb + ISW_DISK_BLOCK_SIZE;
		init_waitqueue_head(&bh2->b_wait); /* FIXME do we need this? */
		get_bh(bh2);                       /* FIXME do we need this? */
		generic_make_request(WRITE, bh2);
	}

	init_waitqueue_head(&bh1->b_wait);         /* FIXME do we need this? */
	get_bh(bh1);                               /* FIXME do we need this? */
	generic_make_request(WRITE, bh1);
	DEBUG("iswraid: start_mpb_write exiting for major %d minor %d\n",
	      MAJOR(dev), MINOR(dev));
	return 0;
}

/* returns a pointer to the next raiddev in MPB, given the current one */
static struct _MpbRaidDev *advance_raiddev(struct _MpbRaidDev *raiddev)
{
	int k; /* map size correction */
	struct _MpbRaidVol *vol = &raiddev->raidVol;
	struct _MpbRaidMap *map = &vol->loMap;
	k = (map->numMembers - 1) * sizeof(u32);
	raiddev++; /* off by k bytes for sure */
	raiddev = (struct _MpbRaidDev *) ((u8 *) raiddev + k);
	if (vol->migrState) { /* need to add space for another map */
		map = (struct _MpbRaidMap *) raiddev;
		k = (map->numMembers - 1) * sizeof(u32); /* correction again */
		raiddev = (struct _MpbRaidDev *) ((u8 *) raiddev
						  + sizeof(*map) + k);
	}
	return raiddev;
}

/* Brings the array's in-memory MPB up to date with the driver's view of
 * disk and volume state, bumps the generation number, recomputes the
 * checksum and starts an MPB write to every disk of the array.
 * May not sleep.
 */
static void
update_mpb(struct array *array)
{
	struct _RaidMpb *mpb = array->mpb;
	struct _MpbRaidDev *raiddev;
	int d, v, m;

	/* copy each disk's status word into its MPB disk table entry */
	for (d = 0; d < mpb->numDisks; d++)
		mpb->diskTbl[d].status = array->disks[d]->status;

	/* the raid device records start right after the disk table */
	raiddev = (struct _MpbRaidDev *) &mpb->diskTbl[mpb->numDisks];
	for (v = 0; v < mpb->numRaidDevs; v++) {
		struct _MpbRaidMap *map = &raiddev->raidVol.loMap;
		struct volume *volume = array->volumes[v];

		/* failed takes precedence over degraded */
		if (test_bit(FAILED_BIT, &volume->state))
			map->mapState = FAILED_MAP;
		else if (test_bit(DEGRADED_BIT, &volume->state))
			map->mapState = DEGRADED_MAP;

		/* flag every member that is failed or degraded in this map */
		for (m = 0; m < volume->numdisks; m++) {
			if (!test_bit(FAILED_BIT, &volume->disks[m]->status)
			    && !test_bit(m, &volume->degradedbits))
				continue;
			map->diskOrdTbl[m] |= (1 << 24);
		}
		raiddev = advance_raiddev(raiddev);
	}

	/* new generation, new checksum */
	mpb->generationNum++;
	mpb->checkSum = compute_checksum((const u32 *) mpb, mpb->mpbSize);

	/* now just try to write the new MPB to each disk */
	for (d = 0; d < mpb->numDisks; d++)
		start_mpb_write(array->disks[d], array->mpb);
}

/* Notifies all volumes containing the given disk that the disk has failed.
 * When this happens, volumes become degraded or turn from degraded to failed.
 * To be called only after the initialization of all data structures is done.
 */
/* FIXME a new mode of operation where we don't fail degraded volumes ever
 * would be nice to have, as marking yet another disk failed causes us to
 * lose information about which disk we should rather rebuild from...
 */
static void
notify_volumes(struct disk *disk)
{
	int j;
	/* process each volume that has this disk... */
	for (j = 0; j < disk->array->mpb->numRaidDevs; j++) {
		struct volume *vol = disk->array->volumes[j];
		/* and degrade or fail it if RAID1 and up */
		if (vol->raidlevel >= 1) {
			if (test_bit(DEGRADED_BIT, &vol->state))
				set_bit(FAILED_BIT, &vol->state);
			set_bit(DEGRADED_BIT, &vol->state);
		}
	}
}

/* Makes a quick note of a disk error (but does not update errorcount yet).
 * Fails a disk if RAID1 volume write to it fails. Initiates MPB writing to
 * mark this condition. Resubmits failed RAID1 reads to mirror. May not sleep.
 */
static int
handle_io_error(struct buffer_head *bh)
{
	struct disk *disk;
	int i, oldfail;
	struct bh_private *private = bh->b_private;
	struct volume *volume = private->volume;
	if (!volume) {
		printk(KERN_ERR "iswraid: non-volume related IO error on "
		       "disk major %d minor %d\n",
		       MAJOR(bh->b_rdev), MINOR(bh->b_rdev));
		set_bit(0, &private->status); /* indicate error with any bit */
		return 0;
	}

	/* look for the disk number in this volume that got error */
	for (i = 0; i < volume->numdisks; i++)
		if (volume->disks[i]->dev == bh->b_rdev)
			break;
	if (i >= volume->numdisks)
		BUG(); /* currently disks cannot just leave volumes */
	set_bit(i, &private->status); /* just mark that disk had a problem */

	printk(KERN_ERR "iswraid: %s IO error on disk major %d minor %d, "
	       "volume devbit %d, RAID level %d\n",
	       private->rw == WRITE ? "write" : "read", MAJOR(bh->b_rdev),
	       MINOR(bh->b_rdev), volume->devbit, volume->raidlevel);

	if (volume->raidlevel == 0)
		return 0; /* nothing else to do at the moment */
	
	if (private->rw == WRITE) { /* RAID1 (or higher RAID level) write */
		disk = volume->disks[i];
		oldfail = test_bit(FAILED_BIT, &disk->status);
		set_bit(FAILED_BIT, &disk->status);
		set_bit(i, &volume->degradedbits);
		if (!oldfail) { /* it wasn't failed before */
			notify_volumes(disk); /* notify all volumes it's in */
			update_mpb(volume->array);
		}
		return 0;
	}

	if (volume->raidlevel > 1)
		return 0; /* high RAID levels not yet supported */
			     
	/* RAID1, RAID1E read */
	if (test_bit(BH_PrivateStart, &bh->b_state) /* a mirror read */
	    || volume->numdisks == 1) /* or there is no mirror */
		return 0; /* then we've done all we can, bail */
	
	/* try to find a suitable mirror for this IO */
	if (!(volume->numdisks & 0x01)) /* even number of disks */
		disk = volume->disks[i ^ 0x01]; /* just move 1 up or down */
	else {  /* odd number of disks, not possible with ICH6R, not tested */
		int rsect = bh->b_rsector - volume->pbaOfLba0;
		int strip_in_disk = rsect / volume->blocksPerStrip;
		int disknum;
		if (i ^ strip_in_disk) /* even disk, odd strip or vice versa */
			disknum = i - 1; /* mirror behind */
		else         /* even disk, even strip or odd disk, odd strip */
			disknum = i + 1; /* mirror ahead */
		if (disknum >= volume->numdisks) { /* forward wraparound */
			disknum = 0;
			bh->b_rsector += volume->blocksPerStrip; /* strip++ */
		} else if (disknum < 0) { /* backward wraparound */
			disknum = volume->numdisks - 1;
			bh->b_rsector -= volume->blocksPerStrip; /* strip-- */
		}
		disk = volume->disks[disknum];
	}

	if (!test_bit(FAILED_BIT, &disk->status) /* is the mirror OK? */
	    && !test_bit(i, &volume->degradedbits)) { /* really OK? */
		bh->b_rdev = disk->dev;
		set_bit(BH_PrivateStart, &bh->b_state); /* a mirror read */
		/* clear disk error bit to avoid spoiling overall success */
		clear_bit(i, &private->status);
		/* but remember that this error happened, anyway */
		set_bit(i + MAX_RAID_MEMBER_DISKS, &private->status);
		DEBUG("iswraid: resubmitting RAID1 read IO to a mirror: new "
		      "rsector = %lu, major %d minor %d\n", bh->b_rsector,
		      MAJOR(bh->b_rdev), MINOR(bh->b_rdev));
		generic_make_request(READ, bh);  /* submit the new IO */
		return 1; /* let caller know there is still hope */
	}

	return 0;
}		

/* Called when the original, "big" IO completes. For every member disk
 * flagged in private->status (either directly or through its "error before
 * the read was reissued to a mirror" bit) the disk's errorcount is
 * incremented. The pre-mirror bits are cleared on the way, so that an IO
 * which eventually succeeded through the mirror is seen as successful by
 * the code that follows.
 * A disk whose errorcount has reached iswraid_error_threshold (when the
 * threshold is non-zero) and that is not yet failed is marked failed, its
 * volumes are notified, and the MPBs are rewritten once at the end (all
 * these disks are in the same array). May not sleep.
 */
static void
update_error_counts(struct buffer_head *bh)
{
	struct bh_private *private = bh->b_private;
	struct volume *volume = private->volume;
	int idx, need_mpb_write = 0;

	if (!volume) /* ignore non-volume related IOs */
		return;

	for (idx = 0; idx < volume->numdisks; idx++) {
		struct disk *disk = volume->disks[idx];
		int premirror = idx + MAX_RAID_MEMBER_DISKS;
		int had_error = test_bit(idx, &private->status)
			|| test_bit(premirror, &private->status);

		if (had_error) {
			atomic_inc(&disk->errorcount);
			if (iswraid_error_threshold /* threshold in use */
			    && (atomic_read(&disk->errorcount)
				>= iswraid_error_threshold) /* and reached */
			    && !test_bit(FAILED_BIT, &disk->status)) {
				set_bit(FAILED_BIT, &disk->status);
				set_bit(idx, &volume->degradedbits);
				notify_volumes(disk);
				need_mpb_write = 1;
			}
		}
		clear_bit(premirror, &private->status);
	}

	if (need_mpb_write)
		update_mpb(volume->array);
}
 
/* Local stand-in for end_buffer_io_sync (which fs/buffer.c does not
 * export): records the result in the buffer, unlocks it, and drops one
 * reference.
 */
static void end_io_sync(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate); /* record success or failure */
	unlock_buffer(bh);                  /* release the buffer lock */
	put_bh(bh);                         /* drop one reference */
}

/* Completion routine installed on every bh this driver submits, used to
 * track the success of each component IO. It is called from a softirq
 * context, so it can't do any sleeping.
 *
 * A failed component is first offered to handle_io_error(); if that
 * resubmits it, nothing else happens here. Otherwise the outstanding count
 * is dropped, and whoever brings it to zero reports the overall result:
 * to the parent bh if the IO was cloned or split, or through the saved
 * completion routine if the original bh was remapped in place.
 */
static void
end_io(struct buffer_head *bh, int uptodate)
{
	struct bh_private *private = bh->b_private;

	if (!private) /* it should not have come to us if !private */
		BUG();

	/* if the failed IO got resubmitted we must bail early */
	if (!uptodate && handle_io_error(bh))
		return;

	if (atomic_dec_and_test(&private->count)) {
		int success;

		if (private->status) /* if any disks had errors */
			update_error_counts(bh);
		success = private->status ? 0 : 1;

		if (!private->parent) {
			/* we had just changed a few fields in bh: put them
			 * back and hand the bh to its real owner
			 */
			bh->b_private = private->old_private;
			bh->b_end_io = private->old_endiofn;
			private->old_endiofn(bh, success);
			kfree(private);
			return; /* don't free bh because we didn't alloc it */
		}

		/* we had an extra bh allocated: complete the original one */
		private->parent->b_end_io(private->parent, success);
		kfree(private);
	}

	bh->b_end_io = NULL; /* just in case, clear our authorization magic */
	kfree(bh);
}

/* Gatekeeper make_request function: controls which requests get passed
 * through to the scsi disks that we've claimed for RAID and which don't.
 * Only buffer heads whose completion routine is end_io or end_io_sync are
 * let through; anything else is failed on the spot.
 */
static int
scsidisk_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	struct bh_private *private = bh->b_private; /* !NULL if ours */
	struct disk *disk;
	int authorized = (bh->b_end_io == &end_io) /* normal vol/MPB IO */
		|| (bh->b_end_io == &end_io_sync); /* sync MPB read/write */

	if (!authorized) {
		printk(KERN_ERR "Disk major %d minor %d has been claimed for "
		       "ISWRAID, no other IO operations permitted on it\n",
		       MAJOR(bh->b_rdev), MINOR(bh->b_rdev));
		bh->b_end_io(bh, 0); /* not uptodate */
		return 0; /* no point repeating, we're done here */
	}

	/* authorized IO, we might have arranged for a quick way to queue it */
	if (private && private->old_makereqfn)
		return private->old_makereqfn(q, rw, bh);

	/* if not, then we have to find the function by device */
	disk = find_disk_by_dev(bh->b_rdev);
	if (!disk) {
		/* we seem to have lost the make_request function to call */
		BUG();
		return 0;
	}
	return disk->old_makereqfn(q, rw, bh);
}

/* make_request routine for RAID0 (striped) volumes.
 *
 * Translates a request against the volume (selected by the minor number of
 * bh->b_rdev) into requests against the member disks:
 *  - a request that fits inside one strip is remapped in place (device and
 *    sector are rewritten, our completion routine is hooked in) and 1 is
 *    returned so that the caller resubmits the same bh;
 *  - a request that crosses strip boundaries is cut at those boundaries,
 *    one cloned bh is submitted per strip, and 0 is returned.
 * A request reaching beyond the end of the volume is failed immediately
 * (return value 0).
 */
static int
iswraid0_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	unsigned long strip, strip_in_disk, disk, block_in_strip, numstrips;
	int minor = MINOR(bh->b_rdev);
	int devbit = minor >> SHIFT;
	unsigned long lastsect, rsect = bh->b_rsector;
	struct volume *volume = raid[devbit];
	struct bh_private *private;

	DEBUG("iswraid: iswraid0_make_request, minor = %d\n", minor);

	/* add partition offset because we work with the whole volume */
	rsect += ataraid_gendisk.part[minor].start_sect;
	/* lastsect >= rsect unless b_size <= 0, which it shouldn't be */
	lastsect = rsect + (bh->b_size >> 9) - 1; 
	DEBUG("iswraid: rsect with partition offset = %lu, lastsect = %lu\n",
	      rsect, lastsect);

	if (lastsect >= volume->sectors) { /* request beyond volume end */
		/* FIXME should we use a static counter here? */
		printk("iswraid: request beyond end of volume, minor = %d, "
		       "startsect = %lu, endsect = %lu\n",
		       minor, rsect, lastsect);
		bh->b_end_io(bh, 0); /* fail it immediately */
		return 0;
	}

	/* time for the kernel to provide the div function... */
	/* it's a good thing we checked blocksPerStrip in detect_volumes */
	strip = rsect / volume->blocksPerStrip;
	block_in_strip = rsect % volume->blocksPerStrip;
	/* it's a good thing we checked numdisks in detect_volumes */
	/* strips are dealt round-robin across the member disks */
	disk = strip % volume->numdisks;
	strip_in_disk = strip / volume->numdisks;

	/* distance from the start of the first strip touched to lastsect,
	 * in whole strips, plus one
	 */
	numstrips = ((lastsect - (rsect - block_in_strip))
		     / volume->blocksPerStrip) + 1; /* strips to play with */
	DEBUG("iswraid: numstrips = %lu\n", numstrips);
	
	private = get_private(); /* need this to track bh */
	/* the count is final before any component IO is submitted */
	atomic_set(&private->count, numstrips); /* 1 or more */
	private->rw = rw;
	private->volume = volume;
	
	if (numstrips <= 1) { /* optimizable case, numstrips == 1, actually */
		unsigned long drsect = (strip_in_disk * volume->blocksPerStrip
					+ block_in_strip);
		DEBUG("iswraid: strip = %lu, block_in_strip = %lu, disk = %lu,"
		      " strip_in_disk = %lu, drsect (without pbaOfLba0 offset)"
		      " = %lu, size = %u, rw = %d\n", strip, block_in_strip,
		      disk, strip_in_disk, drsect, bh->b_size, rw);

		/* The new BH_Lock semantics in ll_rw_blk.c guarantee that this
		 * is the only IO operation happening on this bh.
		 */
		bh->b_rdev = volume->disks[disk]->dev;
		bh->b_rsector = drsect + volume->pbaOfLba0;
		DEBUG("iswraid: final (only) b_rsector = %lu\n",
		      bh->b_rsector);

		/* we carefully save the fields we'll muck up, overkill? */
		private->old_private = bh->b_private;
		private->old_endiofn = bh->b_end_io;
		/* This lets us queue it quickly once we check authorization */
		private->old_makereqfn = volume->disks[disk]->old_makereqfn;
		/* Clear parent, set count and tie to bh with return to us */
		private->parent = NULL;
		bh->b_private = private;
		bh->b_end_io = &end_io;

		/* update the last known head position for the drive */
		spin_lock(&volume->disks[disk]->lock);
		volume->disks[disk]->last_pos
			= bh->b_rsector + (bh->b_size >> 9) - 1;
		spin_unlock(&volume->disks[disk]->lock);
		return 1; /* force the upper level to resubmit this IO */
	}
	else {  /* The complicated case where we work with numerous strips.
		 * Obviously, we could call ataraid's split function,
		 * but it simply splits in halves with no regard for
		 * natural strip boundaries. Furthermore, we want to be
		 * in charge of the complete IO success/failure determination,
		 * as ataraid wrongly seems to respond with the status of
		 * the last component...
		 */
		
		/* Last and first strips are different from the rest */
		unsigned long drsect = (strip_in_disk * volume->blocksPerStrip
					+ block_in_strip);
		/* first piece runs from block_in_strip to the strip's end */
		unsigned long size = ((volume->blocksPerStrip - block_in_strip)
				      << 9);
		char *bufferposition = bh->b_data;
		private->parent = bh;
		/* if oneandonly_makereqfn non-NULL, quick queueing possible */
		private->old_makereqfn = oneandonly_makereqfn;

		while (rsect <= lastsect) {
			struct buffer_head *bh1 = ataraid_get_bhead();
			if (!bh1)
				BUG(); /* impossible, see ataraid.c */

			/* dupe the bufferhead and update what's different */
			memcpy(bh1, bh, sizeof(*bh));
			bh1->b_end_io = &end_io; /* return to us */
			bh1->b_private = private;

			/* adjust the size for the last strip */
			if ((block_in_strip == 0)
			    && (lastsect - rsect < volume->blocksPerStrip))
				size = (lastsect - rsect + 1) << 9;
			
			DEBUG("iswraid: strip = %lu, block_in_strip = %lu, "
			      "disk = %lu, strip_in_disk = %lu, drsect "
			      "(without pbaOfLba0 offset) = %lu, size = %lu, "
			      "rw = %d\n", strip, block_in_strip, disk,
			      strip_in_disk, drsect, size, rw);

			/* The new BH_Lock semantics in ll_rw_blk.c guarantee
			 * that this is the only IO operation on this bh.
			 */
			bh1->b_rdev = volume->disks[disk]->dev;
			bh1->b_rsector = drsect + volume->pbaOfLba0;
			bh1->b_size = size;
			bh1->b_data = bufferposition;
			DEBUG("iswraid: final (many) b_rsector = %lu\n",
			      bh1->b_rsector);

			/* update the last known head position for the drive */
			spin_lock(&volume->disks[disk]->lock);
			volume->disks[disk]->last_pos
				= bh1->b_rsector + (bh1->b_size >> 9) - 1;
			spin_unlock(&volume->disks[disk]->lock);

			/* submit and update our variables for next strip */
			generic_make_request(rw, bh1);
			strip++; /* don't really need this, except for debug */
			bufferposition += size;
			drsect -= block_in_strip; /* needed on first strip */
			block_in_strip = 0; /* all after first start at 0 */
			rsect += (size >> 9);
			disk++;
			size = volume->blocksPerStrip << 9; /* full strip */
			if (disk >= volume->numdisks) { /* start next stripe */
				disk = 0;
				strip_in_disk++;
				drsect += volume->blocksPerStrip;
			}				
		}
	}

	return 0; /* We've submitted all IOs ourselves */
}

/* Submits a RAID1 write: one clone of bh is sent to every member disk that
 * is neither failed nor marked degraded in this volume. The original bh is
 * completed by end_io() when the last clone finishes.
 *
 * rsect and lastsect are the first and last sector of the request relative
 * to the start of the volume (partition offset already applied).
 *
 * Always returns 0: the IO has either been submitted or failed here.
 *
 * The set of target disks is determined BEFORE anything is submitted, so
 * that the completion count is final when the first clone goes out (a clone
 * may complete before the next one is even built), and so that no tracking
 * structure is allocated on the paths that fail the IO immediately.
 */
static int
raid1_write_request(request_queue_t *q, int rw, struct buffer_head *bh,
		    unsigned long rsect, unsigned long lastsect)
{
	int i, minor = MINOR(bh->b_rdev);
	int devbit = minor >> SHIFT;
	struct volume *volume = raid[devbit];
	struct bh_private *private;
	unsigned int targets = 0; /* bit i set: disk i gets a copy */
	int count = 0;            /* number of bits set in targets */
	DEBUG("iswraid: iswraid1_write_request, minor = %d\n", minor);

	if (volume->numdisks > 2) { /* FIXME should do RAID1E here */
		/* This is not possible with ICH5R and ICH6R option ROM */
		printk(KERN_ERR "iswraid: RAID1E volume minor %d has %d "
		       "member disks, this case has not been implemented\n",
		       minor, volume->numdisks);
		bh->b_end_io(bh, 0);
		return 0;
	}

	/* numdisks == 2 is normal RAID1; take a snapshot of usable disks */
	for (i = 0; i < volume->numdisks; i++) {
		if (test_bit(FAILED_BIT, &volume->disks[i]->status)
		    || test_bit(i, &volume->degradedbits)) {
			DEBUG("iswraid: disk state = 0x%08x, "
			      "volume degraded bits = 0x%08x\n",
			      volume->disks[i]->status,
			      volume->degradedbits);
			continue;
		}
		targets |= 1U << i;
		count++;
	}

	if (!count) { /* nothing to submit to */
		printk(KERN_ERR "iswraid: no nonfailed disks were "
		       "found for volume minor %d, failing IO\n",
		       minor);
		bh->b_end_io(bh, 0);
		return 0;
	}

	private = get_private(); /* for tracking original bh */
	private->parent = bh;
	/* if oneandonly_makereqfn non-NULL, quick queueing possible */
	private->old_makereqfn = oneandonly_makereqfn;
	private->rw = rw;
	private->volume = volume;
	atomic_set(&private->count, count);

	for (i = 0; i < volume->numdisks; i++) {
		struct buffer_head *bh1;
		if (!(targets & (1U << i)))
			continue;

		if (!(bh1 = ataraid_get_bhead()))
			BUG(); /* impossible, see ataraid.c */

		/* dupe the bufferhead and change what's needed */
		memcpy(bh1, bh, sizeof(*bh));
		bh1->b_end_io = &end_io; /* return to us */
		bh1->b_private = private;
		bh1->b_rsector = rsect + volume->pbaOfLba0;
		bh1->b_rdev = volume->disks[i]->dev;

		/* update the last known head position for the drive; it is
		 * kept in disk sectors (i.e. including pbaOfLba0), which is
		 * what the read path compares it against
		 */
		spin_lock(&volume->disks[i]->lock);
		volume->disks[i]->last_pos = lastsect + volume->pbaOfLba0;
		spin_unlock(&volume->disks[i]->lock);

		/* private may be freed by end_io() as soon as the last
		 * clone has been submitted; it is not touched after this
		 */
		generic_make_request(rw, bh1);
	}
	return 0; /* We've submitted all IO ourselves */
}

/* Head movements longer than this many sectors are all rated equally bad
 * when raid1_read_request() picks a mirror.
 */
#define HUGE_MOVE_BLOCKS 4096

/* Prepares a RAID1 read: picks one usable member disk and remaps bh to it
 * in place, hooking in our completion routine.
 *
 * rsect and lastsect are the first and last sector of the request relative
 * to the start of the volume (partition offset already applied).
 *
 * Returns 1 when bh has been remapped and the caller should resubmit it,
 * 0 when the IO has been failed here (no usable disk, or more than two
 * member disks, which is not implemented). bh is left unmodified on the
 * failure paths.
 */
static int
raid1_read_request(request_queue_t *q, int rw, struct buffer_head *bh,
		   unsigned long rsect, unsigned long lastsect)
{
	int minor = MINOR(bh->b_rdev);
	int devbit = minor >> SHIFT;
	struct volume *volume = raid[devbit];
	struct bh_private *private;
	int i, bestdisk = -1; /* best disk found; initially a bogus value */
	unsigned long bestdistance = ~0UL; /* as far as we could possibly go */
	unsigned long distance, pos;
	unsigned long physsect; /* first sector of the request on the disk */
	DEBUG("iswraid: iswraid1_read_request, minor = %d\n", minor);

	if (volume->numdisks > 2) { /* FIXME should do RAID1E here */
		/* This is not possible with ICH5R and ICH6R option ROM */
		printk(KERN_ERR "iswraid: RAID1E volume minor %d has %d "
		       "member disks, this case has not been implemented\n",
		       minor, volume->numdisks);
		bh->b_end_io(bh, 0);
		return 0;
	}

	/* numdisks == 2 is normal RAID1 */
	physsect = rsect + volume->pbaOfLba0;

	/* Reads are simple in principle. Pick a disk and go.
	 * We chose the disk with the last known head position closest.
	 * FIXME performance considerations, offline checking needed.
	 * All moves over HUGE_MOVE_BLOCKS are considered equally bad.
	 * When there is a tie, use a volume-specific tiebreaker.
	 * Of course, without knowing the true physical disk geometry
	 * or at least knowing which requests from the queue have truly
	 * been dispatched to the disk and whether the head is moving
	 * down or up, all this is likely fairly worthless...
	 * FIXME read about elevator scheme and possibly adjust this.
	 * We could also keep an IO count for each disk and use that,
	 * but looking the disk up from b_rdev is a bit clumsy...
	 */
	for (i = 0; i < volume->numdisks; i++) {
		if (test_bit(FAILED_BIT, &volume->disks[i]->status)
		    || test_bit(i, &volume->degradedbits)) {
			DEBUG("iswraid: disk state = 0x%08x, "
			      "degradedbits = 0x%x\n",
			      volume->disks[i]->status,
			      volume->degradedbits);
			continue; /* ignore failed disks */
		}
		spin_lock(&volume->disks[i]->lock);
		pos = volume->disks[i]->last_pos;
		spin_unlock(&volume->disks[i]->lock);

		/* absolute distance, computed without leaving unsigned
		 * long so that large sector numbers are not truncated
		 */
		distance = (physsect > pos) ? physsect - pos : pos - physsect;

		if (distance > HUGE_MOVE_BLOCKS)     /* huge moves  */
			distance = HUGE_MOVE_BLOCKS; /* are all bad */

		if (distance < bestdistance) { /* improvement */
			bestdistance = distance;
			bestdisk = i;
		} else if (distance == bestdistance) {
			/* races regarding tiebreak are irrelevant */
			bestdisk = volume->tiebreak;
			volume->tiebreak = 1 - volume->tiebreak;
		}
	}

	if (bestdisk < 0) {
		printk(KERN_ERR "iswraid: no nonfailed disks were "
		       "found for volume minor %d, failing IO\n",
		       minor);
		bh->b_end_io(bh, 0);
		return 0;
	}

	DEBUG("iswraid: bestdisk = #%d, bestdistance = %lu\n",
	      bestdisk, bestdistance);
	bh->b_rsector = physsect;
	bh->b_rdev = volume->disks[bestdisk]->dev;
	private = get_private(); /* for tracking bh */
	/* we save the fields we'll mess up, an overkill perhaps */
	private->old_private = bh->b_private;
	private->old_endiofn = bh->b_end_io;
	/* This lets us queue it quickly once we check authorization */
	private->old_makereqfn = volume->disks[bestdisk]->old_makereqfn;
	/* Clear parent, set count and tie to bh with return to us */
	private->parent = NULL;
	atomic_set(&private->count, 1);
	private->rw = rw;
	private->volume = volume;
	bh->b_private = private;
	bh->b_end_io = &end_io;
	clear_bit(BH_PrivateStart, &bh->b_state); /* not mirror read */

	/* update the last known head position for the drive, in disk
	 * sectors (including pbaOfLba0) like the value it is compared
	 * against above
	 */
	spin_lock(&volume->disks[bestdisk]->lock);
	volume->disks[bestdisk]->last_pos = lastsect + volume->pbaOfLba0;
	spin_unlock(&volume->disks[bestdisk]->lock);
	return 1; /* force the upper level to resubmit the IO */
}

/* make_request_fn for RAID1 volumes.
 * Rejects the buffer head (ending it with uptodate == 0 and returning 0)
 * when the volume is failed, when it is degraded and iswraid_halt_degraded
 * is set, or when the request reaches past the end of the volume.
 * Otherwise the request, translated to whole-volume sector numbers, is
 * passed to the read or the write path and that path's result is returned.
 */
static int
iswraid1_make_request(request_queue_t *q, int rw, struct buffer_head *bh)
{
	int minor = MINOR(bh->b_rdev);
	struct volume *volume = raid[minor >> SHIFT];
	unsigned long first, last;
	DEBUG("iswraid: iswraid1_make_request, minor = %d\n", minor);

	if (test_bit(FAILED_BIT, &volume->state)) {
		/* FIXME should we use a static counter here? */
		printk(KERN_ERR "iswraid: volume minor = %d is in a failed "
		       "state, failing IO\n", minor);
		goto fail_io;
	}

	if (iswraid_halt_degraded
	    && test_bit(DEGRADED_BIT, &volume->state)) {
		/* FIXME should we use a static counter here? */
		printk(KERN_ERR "iswraid: volume minor = %d is in a degraded "
		       "state and ISWRAID was instructed to fail IOs to such "
		       "volumes\n", minor);
		goto fail_io;
	}

	/* partition-relative sector -> volume-relative sector */
	first = bh->b_rsector + ataraid_gendisk.part[minor].start_sect;
	/* last >= first as long as b_size is positive, which it should be */
	last = first + (bh->b_size >> 9) - 1;
	DEBUG("iswraid: rsect with partition offset = %lu, lastsect = %lu\n",
	      first, last);

	if (last >= volume->sectors) { /* runs off the end of the volume */
		/* FIXME should we use a static counter here? */
		printk(KERN_ERR "iswraid: request beyond end of volume, "
		       "failing IO, minor = %d, startsect = %lu, endsect = "
		       "%lu\n", minor, first, last);
		goto fail_io;
	}

	/* read-ahead is treated as a plain read; everything else is a write */
	if (rw == READ || rw == READA)
		return raid1_read_request(q, READ, bh, first, last);

	return raid1_write_request(q, rw, bh, first, last);

 fail_io:
	bh->b_end_io(bh, 0); /* complete it right away as failed */
	return 0;
}

/* ioctl handler for the RAID devices.
 * Handles BLKGETSIZE, HDIO_GETGEO, HDIO_GETGEO_BIG and BLKRRPART itself
 * and forwards every other command to blk_ioctl().
 * Returns 0 or the put_user() result on success paths, -EINVAL for a
 * missing inode/argument, -EFAULT when the geometry cannot be copied out,
 * and -EACCES/-ERESTARTSYS/-EBUSY from the BLKRRPART path.
 */
static int
iswraid_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
	      unsigned long arg)
{
	unsigned int minor, devbit, part, base;

	if (!inode || !inode->i_rdev)
		return -EINVAL;

	minor = MINOR(inode->i_rdev);
	devbit = minor >> SHIFT; /* SHIFT comes from ataraid, = 4 */

	switch (cmd) {
	case BLKGETSIZE: {	/* Return device size */
		unsigned long *usize = (unsigned long *) arg;

		DEBUG("iswraid: ioctl BLKGETSIZE, minor = %d\n", minor);
		if (!arg)
			return -EINVAL;

		/* low four bits set: a partition, otherwise the whole vol */
		if (minor & 0xf)
			return put_user(ataraid_gendisk.part[minor].nr_sects,
					usize);

		return put_user(raid[devbit]->sectors, usize);
	}

	case HDIO_GETGEO: {
		struct hd_geometry *geo = (struct hd_geometry *) arg;

		DEBUG("iswraid: ioctl HDIO_GETGEO, minor = %d\n", minor);
		if (!arg)
			return -EINVAL;

		/* fields are copied in order; the first failure stops it.
		 * cylinders may get truncated by the cast below.
		 */
		if (put_user(raid[devbit]->geom.heads, &geo->heads)
		    || put_user(raid[devbit]->geom.sectors, &geo->sectors)
		    || put_user((unsigned short) raid[devbit]->geom.cylinders,
				&geo->cylinders)
		    || put_user(ataraid_gendisk.part[minor].start_sect,
				&geo->start))
			return -EFAULT;

		return 0;
	}

	case HDIO_GETGEO_BIG: {
		struct hd_big_geometry *big = (struct hd_big_geometry *) arg;

		DEBUG("iswraid: ioctl HDIO_BIG_GETGEO, minor = %d\n", minor);
		if (!arg)
			return -EINVAL;

		if (put_user(raid[devbit]->geom.heads, &big->heads)
		    || put_user(raid[devbit]->geom.sectors, &big->sectors)
		    || put_user(raid[devbit]->geom.cylinders, &big->cylinders)
		    || put_user(ataraid_gendisk.part[minor].start_sect,
				&big->start))
			return -EFAULT;

		return 0;
	}

	case BLKRRPART: 	    /* Re-Read Partition Table. */
		DEBUG("iswraid: ioctl BLKRRPART, minor = %d\n", minor);
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;

		if (down_interruptible(&iswraid_sem))
			return -ERESTARTSYS;

		/* refuse while anybody besides the caller has it open */
		if (raid[devbit]->refcnt > 1) {
			up(&iswraid_sem);
			return -EBUSY;
		}

		/* walk the whole-volume minor and its 15 partition minors */
		base = minor & ~0xf; /* is this correct? */
		for (part = base; part < base + 16; part++) {
			if (ataraid_gendisk.part[part].nr_sects > 0)
				invalidate_device(
					MKDEV(ataraid_gendisk.major, part), 1);

			/* forget the old partition layout */
			ataraid_gendisk.part[part].start_sect = 0;
			ataraid_gendisk.part[part].nr_sects = 0;
			/* and put the block size back to the default */
			set_blocksize(MKDEV(ataraid_gendisk.major, part),
				      ISW_DISK_BLOCK_SIZE);
		}

		ataraid_register_disk(devbit, raid[devbit]->sectors);
		up(&iswraid_sem);
		return 0;

	default:
		DEBUG("iswraid: other ioctl, minor = %d\n", minor);
		return blk_ioctl(inode->i_rdev, cmd, arg);
	} /* ENDOF switch (cmd) */

	return 0;
}

/* Synchronous read or write of a bunch of contiguous disk sectors.
 *
 * major, minor - the block device to access
 * rw           - request type handed to generic_make_request()
 *                (callers in this file pass READ or WRITE)
 * buffer       - data buffer of at least count * ISW_DISK_BLOCK_SIZE bytes
 * sector       - first sector of the transfer
 * count        - number of ISW_DISK_BLOCK_SIZE-sized blocks to transfer
 *
 * Builds a private buffer_head, submits it, and sleeps in wait_on_buffer()
 * until the IO has completed. Returns 0 on success or -EIO if the buffer
 * did not end up uptodate.
 */
static int __init
rw_sectors_sync(int major, int minor, int rw, unsigned char *buffer,
		unsigned long sector, int count)
{
	int ret = 0;
	kdev_t dev = MKDEV(major, minor);
	struct buffer_head *bh = ataraid_get_bhead();
	if (!bh)
		BUG(); /* impossible, see ataraid.c */

	memset(bh, 0, sizeof(*bh)); /* most fields NULL, hopefully that's OK */
	/* FIXME need a valid b_page for CONFIG_HIGHMEM systems */
	/* describe the transfer: where on which device, how much, and into
	 * which memory
	 */
	bh->b_rsector = sector;
	bh->b_rdev = dev;
	bh->b_size = ISW_DISK_BLOCK_SIZE * count;
	bh->b_data = buffer;
	set_bit(BH_Mapped, &bh->b_state); /* checked in __make_request */
	init_waitqueue_head(&bh->b_wait);
	/* From here on we're picking lines from bread() from fs/buffer.c,
	 * ll_rw_block() and submit_bh() from drivers/block/ll_rw_blk.c
	 */
	set_bit(BH_Sync, &bh->b_state);
	/* the buffer is locked before submission; wait_on_buffer() below
	 * sleeps until it gets unlocked again
	 */
	set_bit(BH_Lock, &bh->b_state);
	bh->b_end_io = &end_io_sync;  
	get_bh(bh);
	generic_make_request(rw, bh);
	/* FIXME do we need barriers or wake_up-s somewhere around here? */
	wait_on_buffer(bh);
	/* the uptodate flag is the only success indication we look at */
	if (!buffer_uptodate(bh)) {      
		printk(KERN_ERR "Synchronous IO to disk major %d minor %d "
		       "failed\n", major, minor);
		ret = -EIO;
	}
	bh->b_end_io = NULL; /* just in case, clear our authorization magic */
	/* the buffer_head itself is released here; 'buffer' stays with the
	 * caller
	 */
	kfree(bh);
	return ret;
}

/* make sure this is divisible by 4 because we work on it by words */
#define INQUIRY_BUFLEN 64 /* 24 would likely suffice here, see SCSI manual */

/* The raid member disks are scsi devices. We do an inquiry to
 * determine the disk serial number. This information is used
 * to order the member disks correctly in the raid array.
 *
 * major, minor  - the SCSI disk to query
 * isw_serial_no - receives the serial number; must point to a buffer of
 *                 length at least (MAX_RAID_SERIAL_LEN + 1), so that we
 *                 can null-terminate the serial number.
 *
 * Returns 0 on success, -EINVAL if no Scsi_Device can be found for the
 * device, or the nonzero result of kernel_scsi_ioctl() on failure.
 */
static int __init
do_inquiry(int major, int minor, unsigned char *isw_serial_no)
{
	kdev_t dev = MKDEV(major, minor);
	int i, retval;
	unsigned char *cmd;
	u32 buffer[INQUIRY_BUFLEN / sizeof(u32) + 2]; /* 2 extra for lengths */
	request_queue_t *q;
	Scsi_Device *SDpnt;

	DEBUG("iswraid: do_inquiry\n");

	/* If the device exists it should have a request queue.
	 * And, according to scsi.c, Scsi_Device * was put in queuedata.
	 * Still, do not dereference a queue pointer that was not handed out.
	 */
	q = blk_get_queue(dev);
	SDpnt = q ? q->queuedata : NULL;

	if (!SDpnt) {
		DEBUG("iswraid: could not get SDpnt for major %d minor %d\n",
		      major, minor);
		return -EINVAL;
	}
	
	for (i = 3; i < INQUIRY_BUFLEN / sizeof(u32) + 2; i++)
		buffer[i] = 0; /* clean the bytes we won't set explicitly */

	/* input data size. No input. */
	buffer[0] = 0;
	/* output buffer size. Doesn't include the two length words up front */
	buffer[1] = INQUIRY_BUFLEN;
	/* cmd is the SCSI command to send */
	cmd = (unsigned char *) &buffer[2];

	cmd[0] = 0x12;		/* Opcode INQUIRY=12h */
	cmd[1] = 0x01;		/* EVPD=1. Return the vital product data
				   specified in page code */
	cmd[2] = 0x80;		/* Page Code Unit serial number page=80h */
	cmd[3] = 0x00;		/* Reserved byte */
	cmd[4] = 0x18;		/* allocation length; 24 bytes is everything */
	cmd[5] = 0x00;		/* Control  byte */

	retval = kernel_scsi_ioctl(SDpnt, SCSI_IOCTL_SEND_COMMAND, buffer);

	if (retval) {
		printk(KERN_ERR "iswraid: ERROR kernel_scsi_ioctl("
		       "SCSI_IOCTL_SEND_COMMAND) failed, code= %d\n", retval);
		return retval;
	}

	/* VPD page is our result at the same location as cmd.
	 * The byte at offset 3 is the Page Length and the serial number
	 * starts at offset 4 (it seems to be just 8 bytes). Copy at most
	 * MAX_RAID_SERIAL_LEN characters and stop at the first space.
	 */
	for (i = 0; i < cmd[3] && i < MAX_RAID_SERIAL_LEN; i++) {
		if (cmd[4 + i] == ' ')
			break;
		isw_serial_no[i] = cmd[4 + i];
	}
	isw_serial_no[i] = '\0';
	return 0;
}

#define TYPICAL_MPBSIZE 1024

/* Read all the MPB blocks, check signature and checksum.
 *
 * major, minor - the disk to read the Intel RAID metadata (MPB) from
 *
 * Returns a kmalloc-ed copy of the complete MPB, which the caller has to
 * kfree, or NULL if the metadata location cannot be determined, memory
 * runs out, IO fails, the signature/version is not acceptable, the size
 * recorded in the MPB is implausible, or the checksum does not match.
 */
static struct _RaidMpb * __init
read_mpb(int major, int minor)
{
	unsigned long mpb_blocknum, mpbsize, nblocks;
	unsigned char *mpbbuf, *mpbbuf2;
	int mpbblocks;
	struct _RaidMpb *mpb;
	
	/* Find the block number of the "first" block of Intel RAID metadata */
	if (!(mpb_blocknum = calc_mpb_blocknum(major, minor)))
		return NULL;

	/* get permanent storage space for the MPB */
	mpbbuf = kmalloc(TYPICAL_MPBSIZE, GFP_KERNEL);

	if (!mpbbuf) {
		printk(KERN_ERR "iswraid: can't kmalloc %d bytes\n",
		       TYPICAL_MPBSIZE);
		return NULL;
	}

	/* Read the RAID metadata "header" */
	if (rw_sectors_sync(major, minor, READ, mpbbuf, mpb_blocknum, 1))
		goto freempbbuf;

	mpb = (struct _RaidMpb *) mpbbuf;

	/* Check Signature and version info, older versions supported */
	if (strncmp(mpb->sig.text, MPB_SIGNATURE, sizeof(MPB_SIGNATURE) - 1)
	    || (strcmp(mpb->sig.text + sizeof(MPB_SIGNATURE) - 1,
		       MPB_VERSION_RAID2) > 0))
	{
		DEBUG("iswraid: read_mpb for major %d minor %d did not find "
		      "a valid Intel Software Raid signature, or version was "
		      "newer than supported: '%-32s'\n",
		      major, minor, mpb->sig.text);
		goto freempbbuf;
	}

	DEBUG("iswraid: Intel RAID Member disk found at major %d minor %d\n",
	      major, minor);

	/* The size comes straight from the disk, so do not trust it.
	 * Round up to whole blocks without the addition that could wrap
	 * around, then refuse sizes that are empty, that would make the
	 * MPB start before sector 0, or whose byte count would not fit
	 * in an int.
	 */
	mpbsize = mpb->mpbSize;
	nblocks = mpbsize / ISW_DISK_BLOCK_SIZE
		+ ((mpbsize % ISW_DISK_BLOCK_SIZE) ? 1 : 0);

	if (!nblocks || nblocks - 1 > mpb_blocknum
	    || nblocks > INT_MAX / ISW_DISK_BLOCK_SIZE) {
		printk(KERN_ERR "iswraid: MPB on major %d minor %d claims an "
		       "implausible size of %lu bytes, ignoring it\n",
		       major, minor, mpbsize);
		goto freempbbuf;
	}

	mpbblocks = nblocks;

	/* If we need more space for the MPB */
	if (mpbblocks > TYPICAL_MPBSIZE / ISW_DISK_BLOCK_SIZE) { 
		mpbbuf2 = kmalloc(mpbblocks * ISW_DISK_BLOCK_SIZE, GFP_KERNEL);

		if (!mpbbuf2) {
			/* report bytes, as the message says, not blocks */
			printk(KERN_ERR "iswraid: can't kmalloc %d bytes\n",
			       mpbblocks * ISW_DISK_BLOCK_SIZE);
			goto freempbbuf;
		}
	
		/* carry the already read header over to the bigger buffer */
		memcpy(mpbbuf2, mpbbuf, ISW_DISK_BLOCK_SIZE);
		mpb = (struct _RaidMpb *) mpbbuf2;
		kfree(mpbbuf);
		mpbbuf = mpbbuf2;
	}
	
	/* Here we read the rest of the MPB, if necessary */
	if (mpbblocks > 1) {
		DEBUG("iswraid: Size of RAID metadata is %d bytes\n",
		      mpb->mpbSize);

		/* note that the rest of the MPB lives _before_ its "header" */
		if (rw_sectors_sync(major, minor, READ,
				    mpbbuf + ISW_DISK_BLOCK_SIZE,
				    mpb_blocknum - (mpbblocks - 1),
				    mpbblocks - 1)) {
			DEBUG("iswraid: couldn't read the rest of RAID MPB\n");
			goto freempbbuf;
		}
	}

	/* Compare checksum read from MPB with newly calculated value */
	if (mpb->checkSum
	    != compute_checksum((const u32 *) mpb, mpb->mpbSize)) {
		printk(KERN_ERR "iswraid: MPB checksum error\n");
		goto freempbbuf;
	}

	DEBUG("iswraid: MPB checksum OK for major %d minor %d\n",
	      major, minor);
	return mpb;
	
 freempbbuf:
	kfree(mpbbuf);
	return NULL;
}

#if 0
/* Update checksum, write the MPB "synchronously", Not yet used. */
static int
write_mpb(int major, int minor, struct _RaidMpb *mpb)
{
	unsigned long mpb_blocknum;
	int mpbblocks;

	DEBUG("iswraid: write_mpb\n");
	
	/* Find the block number of the "first" block of Intel RAID metadata */
	if (!(mpb_blocknum = calc_mpb_blocknum(major, minor)))
		return -EINVAL;

	/* Update the checksum */
	mpb->checkSum = compute_checksum((const u32 *) mpb, mpb->mpbSize);

	/* Write the RAID metadata "header" */
	if (rw_sectors_sync(major, minor, WRITE, (unsigned char *) mpb,
			    mpb_blocknum, 1))
		return -EIO;

	mpbblocks = ((mpb->mpbSize + ISW_DISK_BLOCK_SIZE - 1)
		     / ISW_DISK_BLOCK_SIZE);

	if (mpbblocks <= 1) /* < 1 isn't really possible */
		return 0;

	/* Here we write the rest of the MPB */
	DEBUG("iswraid: Size of RAID metadata is %d bytes\n", mpb->mpbSize);

	/* note that the rest of the MPB lives _before_ its "header" */
	if (rw_sectors_sync(major, minor, WRITE,
			    (unsigned char *) mpb + ISW_DISK_BLOCK_SIZE,
			    mpb_blocknum - (mpbblocks - 1), mpbblocks - 1))
		return -EIO;

	return 0;
}
#endif

/* Probe one disk for Intel RAID membership.
 * Reads and validates the MPB, then obtains the disk serial number.
 * Returns a freshly kmalloc-ed, initialized struct disk which owns the MPB,
 * or NULL (with nothing left allocated) if any step fails. The bdev and
 * list linkage of the new disk are left for the caller to fill in.
 */
static struct disk * __init
probe_disk(int major, int minor)
{
	struct _RaidMpb *mpb;
	struct disk *newdisk;
	
	DEBUG("iswraid: probe_disk for major %d minor %d\n", major, minor);

	mpb = read_mpb(major, minor);
	if (!mpb)
		return NULL;
	
	newdisk = kmalloc(sizeof(struct disk), GFP_KERNEL);
	if (!newdisk) {
		printk(KERN_ERR "iswraid: can't kmalloc struct disk\n");
		kfree(mpb);
		return NULL;
	}

	/* FIXME We should read the serial number for every disk, then we
	 * could do operations like array creation and adding disks to arrays,
	 * marking failed disks normal, etc. When we start reading serial
	 * numbers for disks regardless of MPB existence/state, then it would
	 * be best to limit disks to those attached to Intel SATA controllers.
	 * But currently we only read serial for disks that have a decent MPB.
	 */
	if (do_inquiry(major, minor, newdisk->serial)) {
		printk(KERN_INFO "iswraid: inquiry returned error\n");
		kfree(newdisk);
		kfree(mpb);
		return NULL;
	}

	DEBUG("iswraid: Serial number for major %d minor %d is '%s'\n",
	      major, minor, newdisk->serial);

	/* identity and metadata */
	newdisk->dev = MKDEV(major, minor);
	newdisk->mpb = mpb;
	newdisk->array = NULL;  /* not assigned to any array yet */
	newdisk->status = 0;    /* will match mpbdisk->status soon */

	/* runtime bookkeeping */
	atomic_set(&newdisk->errorcount, 0);
	spin_lock_init(&newdisk->lock); /* guards last_pos */
	newdisk->last_pos = 0;  /* unknown, but the same start for all */

	/* keep the disk's own make_request_fn so that it can be restored */
	newdisk->old_makereqfn = blk_get_queue(newdisk->dev)->make_request_fn;
	DEBUG("iswraid: makereqfn = %p\n", newdisk->old_makereqfn);
	return newdisk;
}

/* (Re)build an array description from the given MPB.
 * For every disk listed in the MPB the matching probed disk is looked up
 * by serial number and entered into array->disks[]; entries for disks that
 * are missing, or whose own MPB names a different family, stay NULL.
 * The array starts out disabled and is enabled as soon as one listed disk
 * has been accepted.
 * The caller must guarantee that mpb->numDisks <= MAX_RAID_MEMBER_DISKS.
 */
static void __init
update_array(struct array *array, struct _RaidMpb *mpb)
{
	int slot;

	array->mpb = mpb; /* in future may need to kmalloc and memcpy this */
	/* FIXME need to deal with missing disks more gracefully,
	 * in particular, need to support degraded RAID1 in this case
	 */
	array->disabled = 1; /* until a usable disk shows up */

	for (slot = 0; slot < mpb->numDisks; slot++) {
		struct _MpbDisk *entry = &mpb->diskTbl[slot];
		struct disk *member;

		array->disks[slot] = NULL; /* pessimistic default */
		member = find_disk_by_serial(entry->serial.serial);

		if (!member) { /* the MPB names a disk we did not probe */
			unsigned char name[MAX_RAID_SERIAL_LEN + 1];

			/* the on-disk serial need not be terminated */
			strncpy(name, entry->serial.serial,
				MAX_RAID_SERIAL_LEN);
			name[MAX_RAID_SERIAL_LEN] = 0;
			printk(KERN_INFO "iswraid: Disk '%s' missing, "
			       "disabling array with familyNum 0x%08x\n",
			       name, mpb->familyNum);
			continue;
		}

		member->status = entry->status;	
		
		/* a disk whose own MPB disagrees about the family wins */
		if (member->mpb->familyNum != mpb->familyNum) {
			printk(KERN_INFO "iswraid: Disk '%s' claims to belong "
			       "to array with familyNum 0x%08x, hence marking "
			       "it missing from array with familyNum 0x%08x\n",
			       member->serial, member->mpb->familyNum,
			       mpb->familyNum);
			continue;
		}
		
		/* link disk and array to each other */
		array->disks[slot] = member;
		member->array = array;

		DEBUG("iswraid: Disk '%s' is #%d in array with familyNum "
		      "0x%08x\n", member->serial, slot, mpb->familyNum);

		/* shouldn't encounter any other than CONFIGURED_DISKs here */
		if (entry->status & FAILED_DISK) {
			printk(KERN_INFO "iswraid: Disk '%s' from array with "
			       "familyNum 0x%08x has failed\n",
			       member->serial, mpb->familyNum);
			/* a "continue" here would leave the array disabled */
		}

		array->disabled = 0; /* at least this disk is in place */
	}
}

/* Find arrays by parsing the MPB data on all disks.
 * Walks the global disklist; every disk either joins an array already
 * known by its familyNum or opens a new slot in arrays[]. An array is
 * (re)built from a disk's MPB when the array is new, or when the disk's
 * MPB has a different checksum and a higher generation number than the
 * one the array currently uses. Updates the global arraycount.
 */
static void __init
find_arrays(void)
{
	struct list_head *curr;
	DEBUG("iswraid: looking for RAID arrays\n");

	/* for each disk, set up the array that it belongs to, if any */
	/* for each array being set up, make a list of its disks */
	list_for_each(curr, &disklist) {
		int i;
		struct disk *disk
			= list_entry(curr, struct disk, head);
		int needupdate = 1; /* assume this disk's array needs update */

		/* look for an array of the same family; i ends up as its
		 * index, or as arraycount if there is none
		 */
		for (i = 0; i < arraycount; i++)
			if (arrays[i].mpb->familyNum
			    == disk->mpb->familyNum) {
				if ((arrays[i].mpb->checkSum
				     == disk->mpb->checkSum)
				    || (arrays[i].mpb->generationNum
					>= disk->mpb->generationNum))
					needupdate = 0; /* older MPB found */
				break;
			}

		if (i >= MAX_RAID_ARRAYS) {
			/* the message used to lack its terminating newline */
			printk(KERN_ERR "iswraid: maximum array count %d "
			       "exceeded, ignoring MPB on disk %s\n",
			       MAX_RAID_ARRAYS, disk->serial);
			continue;
		}

		/* a single-disk MPB may describe a disk outside any array */
		if (disk->mpb->numDisks == 1) {
			struct _MpbDisk *mpbdisk = &disk->mpb->diskTbl[0];
			disk->status = mpbdisk->status;
			if (!(mpbdisk->status & CONFIGURED_DISK)) {
				printk(KERN_INFO "iswraid: disk %s with "
				       "status %d is not a member of any "
				       "array, ignoring it\n",
				       disk->serial, mpbdisk->status);
				continue;
			}
		}

		/* unless we do this check, may not call update_array */
		if (disk->mpb->numDisks > MAX_RAID_MEMBER_DISKS) {
			printk(KERN_ERR "iswraid: MPB on disk %s contains "
			       "too many (%d) disks, ignoring it\n",
			       disk->serial, disk->mpb->numDisks);
			continue;
		}
		
		if (i >= arraycount) /* no match above: a new array */
			arraycount++;

		if (needupdate)
			update_array(&arrays[i], disk->mpb);
	}

	printk(KERN_INFO "iswraid: found %d ISWRAID arrays\n", arraycount);
}

/* Detect volumes belonging to one particular array.
 * Fills in consecutive entries of the global volumes[] table (advancing
 * volumecount) from the raid device records that follow the disk table in
 * the array's MPB, orders the member disks of each volume, and registers
 * every usable volume with ATARAID.
 * Returns the number of volumes registered as RAID devices.
 */
static int __init
detect_volumes(struct array *array)
{
	int j, k, numdegrdisks, count = 0;
	struct _RaidMpb *mpb = array->mpb;
	/* the raid device records start right behind the last disk entry */
	struct _MpbRaidDev *raiddev
		= (struct _MpbRaidDev *) &mpb->diskTbl[mpb->numDisks];
	struct volume *volume = &volumes[volumecount];

	if (mpb->numRaidDevs > MAX_RAID_VOLUMES) {
		printk(KERN_WARNING "iswraid: array with familyNum 0x%08x "
		       "contains too many (%d) volumes, using only the first "
		       "%d of them\n", mpb->familyNum, mpb->numRaidDevs,
		       MAX_RAID_VOLUMES);
		mpb->numRaidDevs = MAX_RAID_VOLUMES;
	}
	
	for (j = 0; j < mpb->numRaidDevs; j++, volumecount++, volume++) {
		struct _MpbRaidVol *vol = &raiddev->raidVol;
		struct _MpbRaidMap *map = &vol->loMap;
		volume->state = 0;      /* assume OK, but check on map */
		if (map->mapState == FAILED_MAP)
			set_bit(FAILED_BIT, &volume->state);
		else if (map->mapState == DEGRADED_MAP)
			set_bit(DEGRADED_BIT, &volume->state);
		volume->devbit = -1;    /* not registered with ATARAID yet */
		volume->refcnt = 0;
		volume->tiebreak = 0;
		volume->raidlevel = map->raidLevel;
		volume->numdisks = map->numMembers; /* <= mpb->numDisks */
		volume->pbaOfLba0 = map->pbaOfLba0;
		volume->blocksPerStrip = map->blocksPerStrip;		
		/* FIXME we don't use numDataBlocksHi, so 2TB is max size */
		volume->sectors = raiddev->numDataBlocksLo;
		/* FIXME shouldn't heads and sectors rather match those on the
		 * first disk? Need to find out how the win. driver fakes them.
		 */
		volume->geom.heads = 255;
		volume->geom.sectors = 63;
		volume->geom.cylinders = (volume->sectors / volume->geom.heads
					  / volume->geom.sectors);
		strncpy(volume->serial, raiddev->serial.serial,
			MAX_RAID_SERIAL_LEN);
		volume->serial[MAX_RAID_SERIAL_LEN] = 0;
		DEBUG("iswraid: Volume #%d with name %s has %lu total sectors,"
		      " H/S/C = 255/63/%u, pbaOfLba0 = %u, blocksPerMember = "
		      "%u, blocksPerStrip = %u, numDataStripes = %u\n",
		      volumecount, volume->serial, volume->sectors,
		      volume->geom.cylinders, volume->pbaOfLba0,
		      map->blocksPerMember, volume->blocksPerStrip,
		      map->numDataStripes);
		volume->array = array;
		array->volumes[j] = volume;
		DEBUG("iswraid: Volume #%d with name %s is volume #%d "
		      "in%s array with familyNum 0x%08x and it has "
		      "raid level %d\n", volumecount, volume->serial, j,
		      (array->disabled ? " disabled" : ""),
		      mpb->familyNum, volume->raidlevel);

		volume->degradedbits = 0;
		for (k = 0; k < MAX_RAID_MEMBER_DISKS; k++)
			volume->disks[k] = NULL;
		/* We do not attempt to do any kind of error recovery on RAID
		 * array errors. Just do not use arrays which are in abnormal
		 * state. Use Option ROM or config utility to fix, for now.
		 * The only exception is degraded map state---we allow it.
		 * And we don't support RAID levels other than 0 and 1.
		 * (raiddev->status can be ignored---nothing meaningful there.)
		 */
		if (!volume->numdisks || volume->numdisks > mpb->numDisks
		    || !volume->blocksPerStrip || vol->migrState
		    || (map->mapState && map->mapState != DEGRADED_MAP)
		    || volume->raidlevel > 1) {
			printk(KERN_INFO "iswraid: Volume #%d with name %s "
			       "has %d member disks, %u blocks per strip, "
			       "migration state %d, map state %d and raid "
			       "level %d; disabling it\n", volumecount,
			       volume->serial, volume->numdisks,
			       volume->blocksPerStrip, vol->migrState,
			       map->mapState, volume->raidlevel);
			set_bit(DISABLED_BIT, &volume->state);
		}

		/* in this case let's not attempt disk ordering */
		if (volume->numdisks > mpb->numDisks) {
			raiddev = advance_raiddev(raiddev);
			continue;
		}
		
		/* Now we do the ordering of disks in the volume. */
		numdegrdisks = 0; /* also check on their fail status */
		for (k = 0; k < volume->numdisks; k++) {
			/* low 24 bits: index into the array's disk table,
			 * top 8 bits: nonzero if the disk is degraded
			 */
			int index = map->diskOrdTbl[k] & 0xffffff;
			int status = map->diskOrdTbl[k] >> 24;
			/* valid indices are 0 .. numDisks - 1, so numDisks
			 * itself must be rejected as well
			 */
			if (index >= array->mpb->numDisks) {
				printk(KERN_INFO "iswraid: Disk #%d of volume "
				       "#%d with name %s refers to a "
				       "nonexisting disk #%d in array; "
				       "disabling volume\n", k, volumecount,
				       volume->serial, index);
				set_bit(DISABLED_BIT, &volume->state);
				continue;
			}
			volume->disks[k] = array->disks[index];
			if (status) { /* volume considers the disk degraded */
				printk(KERN_INFO "iswraid: Disk #%d of volume "
				       "#%d with name %s is degraded\n",
				       k, volumecount, volume->serial);
				set_bit(k, &volume->degradedbits);
				/* for RAID 1 and up double check vol state */
				if (volume->raidlevel > 0) {
					numdegrdisks++;
					set_bit(DEGRADED_BIT, &volume->state);
				}
			}					
		}
		if (numdegrdisks > 1) { /* volume must be marked failed */
			set_bit(FAILED_BIT, &volume->state);
			printk(KERN_INFO "iswraid: Volume #%d with name %s "
			       "and raid level %d has been marked failed due "
			       "to %d degraded disks; disabling it\n",
			       volumecount, volume->serial, volume->raidlevel,
			       numdegrdisks);
			set_bit(DISABLED_BIT, &volume->state);
		}
		
		/* if volume isn't disabled and its array isn't disabled, then
		 * register it with ATARAID and do the remaining setup steps
		 */
		if (!test_bit(DISABLED_BIT, &volume->state)
		    && !array->disabled) {
			/* raid level 0 gets the RAID0 ops, level 1 the RAID1
			 * ops; other levels were disabled above
			 */
			int devbit = ataraid_get_device(volume->raidlevel
							? &iswraid1_ops
							: &iswraid0_ops);
			if (devbit < 0) {
				printk(KERN_ERR "iswraid: too many "
				       "RAID devices for ATARAID\n");
				break;
			}
			raid[devbit] = volume;
			volume->devbit = devbit;
			printk(KERN_INFO "iswraid: registering volume #%d "
			       "with name %s over %d member disks as a RAID "
			       "device with minor %d, ATARAID raiddev %d\n",
			       volumecount, volume->serial, volume->numdisks,
			       devbit << SHIFT, devbit);
			ataraid_register_disk(devbit, volume->sectors);
			count++;
		}
		raiddev = advance_raiddev(raiddev);		
	}
	return count;
}

#ifdef CONFIG_PROC_FS

static int
find_disk_index(struct disk *disk)
{
	struct list_head *curr;
	int k = 0;
	list_for_each(curr, &disklist) {
		if (disk == list_entry(curr, struct disk, head))
			return k;
		k++;
	}
	return -1;
}

/* /proc read handler for "arrays": one header line plus one line per
 * array with its family number, generation number, disabled flag, disk
 * and volume counts, and comma separated lists of disk indices (as given
 * by find_disk_index, -1 for an absent disk) and of volume indices into
 * volumes[]. Returns the number of bytes written to page and sets *eof.
 */
static int
iswraid_proc_readarrays(char *page, char **start, off_t offset, int count,
			int *eof, void *data)
{
	/* with the current MAX_RAID_ARRAYS, MAX_RAID_MEMBER_DISKS and
	 * MAX_RAID_VOLUMES there is no need to check anything; it will fit.
	 */
	int i, j, len = 0;
	MOD_INC_USE_COUNT;
	
	len += sprintf(page + len, "# familynum generationnum disabled "
		       "numdisks numvolumes disks volumes\n");
	for (i = 0; i < arraycount; i++) {
		len += sprintf(page + len, "%08x\t%u\t%d\t%d\t%d\t",
			       arrays[i].mpb->familyNum,
			       arrays[i].mpb->generationNum,
			       arrays[i].disabled, arrays[i].mpb->numDisks,
			       arrays[i].mpb->numRaidDevs);
		for (j = 0; j < arrays[i].mpb->numDisks; j++) {
			int k = find_disk_index(arrays[i].disks[j]);
			/* a comma goes in front of all but the first item */
			len += sprintf(page + len, "%s%d", j ? "," : "", k);
		}
		len += sprintf(page + len, "\t");
		for (j = 0; j < arrays[i].mpb->numRaidDevs; j++) {
			/* a pointer difference is not an int; convert it
			 * explicitly so that it matches the %d conversion
			 */
			int k = (int) (arrays[i].volumes[j] - &volumes[0]);
			len += sprintf(page + len, "%s%d", j ? "," : "", k);
		}
		len += sprintf(page + len, "\n");
	}
	*eof = 1;
	
	MOD_DEC_USE_COUNT;
	return len;
}

#define DISK_LINE_LEN 128    /* overkill */

/* /proc read handler for "disks": one header line plus one line per disk
 * on the global disklist with its major, minor, status, index of its array
 * in arrays[] (-1 if the disk belongs to no array), error count, last
 * known position and serial number.
 * Output stops once fewer than DISK_LINE_LEN bytes of the count given by
 * the caller remain; in that case the number of disks handled so far is
 * handed back through *start and *eof stays clear, so that a later call
 * can skip them by means of offset.
 * NOTE(review): the header line is emitted on every call, including such
 * continuation calls; confirm against the /proc read protocol whether
 * that is intended.
 * Returns the number of bytes written to page.
 */
static int
iswraid_proc_readdisks(char *page, char **start, off_t offset, int count,
		       int *eof, void *data)
{
	struct list_head *curr;
	int limit, len = 0, item = 0;
	MOD_INC_USE_COUNT;
	
	limit = count - DISK_LINE_LEN;
	len += sprintf(page + len, "# major minor status array errorcount "
		       "lastpos serial\n");
	list_for_each(curr, &disklist) {
		struct disk *disk = list_entry(curr, struct disk, head);
		/* disks that joined no array have a NULL array pointer;
		 * show -1 for them instead of doing arithmetic on NULL
		 */
		int arrayidx = disk->array
			? (int) (disk->array - &arrays[0]) : -1;
		if (len > limit)
			break;
		item++;           /* consider it processed already */
		if (offset && item <= offset) /* attempt to use the "hack" */
			continue;
		
		spin_lock(&disk->lock); /* not all that important */
		len += sprintf(page + len,
			       "%2d\t%2d\t0x%x\t%d\t%2d\t%10lu\t%s\n",
			       MAJOR(disk->dev), MINOR(disk->dev),
			       disk->status, arrayidx,
			       atomic_read(&disk->errorcount),
			       disk->last_pos, disk->serial);
		spin_unlock(&disk->lock);
	}

	if (item < diskcount) {         /* all of them didn't fit */
		/* a small integer smuggled through the pointer; widen it
		 * first so that the cast is between equally sized types
		 */
		*start = (char *) (unsigned long) item;
		*eof = 0;
	} else
		*eof = 1;
	
	MOD_DEC_USE_COUNT;
	return len;
}

/* /proc read handler for "volumes": one header line plus one line per
 * entry of volumes[] with its device node ("d<devbit>", or "--" if the
 * volume was not registered), state, reference count, index of its array
 * in arrays[], raid level, size and layout values, serial, and a comma
 * separated list of member disk indices (as given by find_disk_index).
 * Returns the number of bytes written to page and sets *eof.
 */
static int
iswraid_proc_readvolumes(char *page, char **start, off_t offset, int count,
			 int *eof, void *data)
{
	/* with the current MAX_RAID_VOLUMES and MAX_RAID_MEMBER_DISKS
	 * there is no need to check anything; it will fit.
	 */
	int i, j, len = 0;
	MOD_INC_USE_COUNT;

	len += sprintf(page + len, "# node state refcnt array raidlevel "
		       "sectors blocksperstrip pbaoflba0 numdisks degradedbits"
		       " serial disks\n");
	for (i = 0; i < volumecount; i++) {
		if (volumes[i].devbit >= 0)
			len += sprintf(page + len, "d%d\t", volumes[i].devbit);
		else
			len += sprintf(page + len, "--\t");
		/* the array index is a pointer difference; convert it
		 * explicitly so that it matches the %d conversion
		 */
		len += sprintf(page + len, "0x%x\t%d\t%d\t%d\t%10lu\t%3d"
			       "\t%10u\t%d\t0x%x\t%s\t",
			       volumes[i].state, volumes[i].refcnt,
			       (int) (volumes[i].array - &arrays[0]),
			       volumes[i].raidlevel, volumes[i].sectors,
			       volumes[i].blocksPerStrip, volumes[i].pbaOfLba0,
			       volumes[i].numdisks, volumes[i].degradedbits,
			       volumes[i].serial);
		for (j = 0; j < volumes[i].numdisks; j++) {
			int k = find_disk_index(volumes[i].disks[j]); 
			/* a comma goes in front of all but the first item */
			len += sprintf(page + len, "%s%d", j ? "," : "", k);
		}
		len += sprintf(page + len, "\n");
	}
	*eof = 1;
	
	MOD_DEC_USE_COUNT;
	return len;
}

/* the /proc "iswraid" directory, parent of the three entries below */
struct proc_dir_entry *iswraid_proc_dir;

/* Create the "iswraid" /proc directory with its "arrays", "disks" and
 * "volumes" read-only entries. If any step fails, everything created so
 * far is removed again. Returns 0 on success and -ENOMEM otherwise.
 */
static int __init
iswraid_proc_init(void)
{
	iswraid_proc_dir = proc_mkdir("iswraid", NULL);
	if (!iswraid_proc_dir)
		return -ENOMEM;

	/* each nesting level undoes its own entry on the way out */
	if (create_proc_read_entry("arrays", 0, iswraid_proc_dir,
				   &iswraid_proc_readarrays, NULL)) {
		if (create_proc_read_entry("disks", 0, iswraid_proc_dir,
					   &iswraid_proc_readdisks, NULL)) {
			if (create_proc_read_entry("volumes", 0,
						   iswraid_proc_dir,
						   &iswraid_proc_readvolumes,
						   NULL))
				return 0;

			remove_proc_entry("disks", iswraid_proc_dir);
		}
		remove_proc_entry("arrays", iswraid_proc_dir);
	}
	remove_proc_entry("iswraid", NULL);

	return -ENOMEM;
}

/* Remove the /proc entries and then the "iswraid" directory itself.
 * This runs at module exit (its only caller is iswraid_exit), so it must
 * be annotated as exit code rather than as init code.
 */
static void __exit
iswraid_proc_cleanup(void)
{
	remove_proc_entry("volumes", iswraid_proc_dir);
	remove_proc_entry("disks", iswraid_proc_dir);
	remove_proc_entry("arrays", iswraid_proc_dir);
	remove_proc_entry("iswraid", NULL);
}

#endif /* #ifdef CONFIG_PROC_FS */

/* Free memory, release block devices, but do not unregister raid devices */
static void
free_resources(void)
{
	struct list_head *curr, *tmp;
	list_for_each_safe(curr, tmp, &disklist) {
		struct disk *disk
			= list_entry(curr, struct disk, head);
		DEBUG("iswraid: freeing resources associated with major %d, "
		      "minor %d\n", MAJOR(disk->dev), MINOR(disk->dev));
		/* if we changed the make_request now is the time to restore */
		if (iswraid_claim_disks) 
			blk_queue_make_request(blk_get_queue(disk->dev),
					       disk->old_makereqfn);
		blkdev_put(disk->bdev, BDEV_RAW);
		if (disk->mpb)
			kfree(disk->mpb);
		kfree(disk); /* we don't bother with list_del, reinit at end */
	}

	/* the following aren't really necessary since we're exiting anyway */
	INIT_LIST_HEAD(&disklist);
	diskcount = 0;
	arraycount = 0;
	volumecount = 0;
}

	
/* Take over all disks on the disklist by installing scsidisk_make_request
 * as the make_request function of their queues. The original functions
 * were recorded when the disks were probed, so overwriting is safe even
 * when several disks share one request queue.
 * As a side effect the global oneandonly_makereqfn is set to the original
 * function if it is the same for all disks, and to NULL otherwise, which
 * may give us some extra chances for optimization when dispatching IOs.
 * The caller must ensure that there is at least one disk on the list.
 */
static void __init
claim_disks_for_raid(void)
{
	struct list_head *pos;
	struct disk *disk = list_entry(disklist.next, struct disk, head);

	/* start out with the first disk's function as the candidate */
	oneandonly_makereqfn = disk->old_makereqfn;
	printk(KERN_INFO "iswraid: claiming ISWRAID disks for RAID, "
	       "no other IO on them allowed\n");

	for (pos = disklist.next; pos != &disklist; pos = pos->next) {
		disk = list_entry(pos, struct disk, head);
		blk_queue_make_request(blk_get_queue(disk->dev),
				       &scsidisk_make_request);

		/* once a mismatch has cleared the candidate, it stays NULL */
		if (!oneandonly_makereqfn)
			continue;
		if (disk->old_makereqfn != oneandonly_makereqfn)
			oneandonly_makereqfn = NULL;
	}
}

/* Module entry point.
 * Probes all SCSI disks for Intel RAID metadata, optionally claims the
 * member disks, assembles arrays and volumes from the metadata and
 * registers the usable volumes with ATARAID.
 * Returns 0 if at least one RAID device got registered and -ENODEV
 * otherwise.
 */
static int __init
iswraid_init(void)
{
	struct list_head *curr;
	struct block_device *bdev;
	struct disk *newdisk;
	int dev_count = 0;             /* block devices processed */
	int major = SCSI_DISK0_MAJOR, minor = 0; /* start w/ major 8 */
	int i;

	printk(KERN_INFO "iswraid: Intel(tm) Software RAID driver %s\n",
	       ISW_VERSION_STRING);

	/* Initialize the raid structure to init values */
	for (i = 0; i < MAX_ATARAID_RAIDDEVS; i++)
		raid[i] = NULL;

	DEBUG("iswraid: probing SCSI disks\n");
	
	/* Probe each SCSI disk looking for our MPB.
	 * SCSI disks are claimed sequentially, so we can stop searching
	 * when we encounter the first invalid device.
	 * The blkdev_get call may cause a request_module("block-major-%d"),
	 * typically causing sd_mod to be loaded.
	 */
	for ( ; ; ) {
		if (!(bdev = bdget(MKDEV(major, minor)))
		    || blkdev_get(bdev, FMODE_READ | FMODE_WRITE,
				  0, BDEV_RAW)) {
			break;
		}

		newdisk = probe_disk(major, minor);
		if (newdisk) {
			/* member disks stay open until free_resources */
			newdisk->bdev = bdev;
			list_add_tail(&newdisk->head, &disklist);
			diskcount++; /* the global counter of raid disks */
		} else {
			/* not one of ours: nobody else would ever release
			 * the device that was just opened, so do it now
			 */
			blkdev_put(bdev, BDEV_RAW);
		}

		dev_count++; /* local counter of all disks processed */
		minor += 16;

		/* finding the next major for SCSI disks is a bit tricky */
		if (minor > 255) {
			minor = 0;
			if (major == SCSI_DISK7_MAJOR) /* stop after maj. 71 */
				break;
			if (major == SCSI_DISK0_MAJOR)    /* from major    8 */
				major = SCSI_DISK1_MAJOR; /* leap to maj. 65 */
			else
				major++;
		}
	}

	printk(KERN_INFO "iswraid: probed %d SCSI disks, "
	       "found %d ISWRAID disks\n", dev_count, diskcount);

	if (!diskcount)
		return -ENODEV;

	if (iswraid_claim_disks)
		claim_disks_for_raid();

	find_arrays(); /* parse MPB data looking for RAID arrays */
	DEBUG("iswraid: detecting RAID volumes\n");
	dev_count = 0; /* reuse this to count new RAID devices registered */

	/* for each array's MPB, discover the volumes listed there */
	for (i = 0; i < arraycount; i++)
		dev_count += detect_volumes(&arrays[i]);

	printk(KERN_INFO "iswraid: detected %d ISWRAID volumes, registered "
	       "%d of them as RAID devices\n", volumecount, dev_count);

	/* Free up all disk MPBs except those that an array is using.
	 * The decision is made by looking through the arrays rather than
	 * through disk->array, because disks that were left out by
	 * find_arrays have no array at all.
	 */
	list_for_each(curr, &disklist) {
		struct disk *disk = list_entry(curr, struct disk, head);
		for (i = 0; i < arraycount; i++)
			if (arrays[i].mpb == disk->mpb)
				break;
		if (i < arraycount) /* need to keep this one */
			continue;
		kfree(disk->mpb);
		disk->mpb = NULL;
	}
	
	if (dev_count) { /* if some raid devs got registered, then we're OK */
#ifdef CONFIG_PROC_FS
		iswraid_proc_init();      /* sorry about the #ifdef */
#endif
		return 0;
	}
	
	free_resources();
	return -ENODEV;
}

/* Module exit point: give back every ATARAID device that was registered,
 * remove the /proc entries and release all per-disk resources.
 */
static void __exit
iswraid_exit(void)
{
	int devbit;

	for (devbit = 0; devbit < MAX_ATARAID_RAIDDEVS; devbit++) {
		if (!raid[devbit]) /* this slot was never registered */
			continue;
		ataraid_release_device(devbit);
	}

#ifdef CONFIG_PROC_FS
	iswraid_proc_cleanup();           /* sorry about the #ifdef */
#endif	
	free_resources();
}

static int
iswraid_open(struct inode *inode, struct file *filp)
{
	unsigned int devbit;
	MOD_INC_USE_COUNT;
	if (!inode || !inode->i_rdev)
		return -EINVAL;
	devbit = MINOR(inode->i_rdev) >> SHIFT;
	if (down_interruptible(&iswraid_sem))
		return -ERESTARTSYS;
	raid[devbit]->refcnt++;
	up(&iswraid_sem);
	return 0;
}
/* release() for the RAID devices: the counterpart of iswraid_open, it
 * drops the opener from the volume's refcnt (under iswraid_sem) and
 * unpins the module.
 * Returns 0, or -EINVAL for a missing inode or device number.
 * The semaphore is taken uninterruptibly: bailing out on a signal would
 * skip both decrements for good.
 */
static int
iswraid_release(struct inode *inode, struct file *filp)
{
	unsigned int devbit;
	if (!inode || !inode->i_rdev)
		return -EINVAL;
	devbit = MINOR(inode->i_rdev) >> SHIFT;
	down(&iswraid_sem);
	raid[devbit]->refcnt--;
	up(&iswraid_sem);
	MOD_DEC_USE_COUNT;
	return 0;
}

/* Module boilerplate: nothing is exported to other modules; iswraid_init
 * and iswraid_exit are the load and unload entry points.
 */
EXPORT_NO_SYMBOLS;
module_init(iswraid_init);
module_exit(iswraid_exit);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Intel");
MODULE_DESCRIPTION("Intel Software RAID support at block device level");
