/*
-*- linux-c -*-
   drbd_syncer.c
   Kernel module for 2.2.x/2.4.x Kernels

   This file is part of drbd by Philipp Reisner.

   Copyright (C) 1999-2001, Philipp Reisner <philipp.reisner@gmx.at>.
        main author.

   Copyright (C) 2002, Lars Ellenberg <l.g.e@web.de>.
        changed scheduling algorithm
        keep track of syncer progress

   drbd is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   drbd is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with drbd; see the file COPYING.  If not, write to
   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.

 */

#include <linux/config.h>
#ifdef CONFIG_MODVERSIONS
#include <linux/modversions.h>
#endif

#include <asm/uaccess.h>
#include <asm/bitops.h>
#include <net/sock.h>
#include <linux/smp_lock.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/pkt_sched.h>
#define __KERNEL_SYSCALLS__
#include <linux/unistd.h>
#include <linux/sched.h>

#include "drbd.h"
#include "drbd_int.h"


/*
  We can not use getblk()/bforget() here, because we can not
  send (maybe dirty) blocks of the buffer cache.
  We really need to read in the data from our disk.
*/

/*
 * One staging buffer of the syncer: a set of pages carved into b_size-byte
 * blocks, each block with its own buffer_head used to read it from the
 * lower level device.  drbd_syncer() keeps two of these and alternates
 * between them (one being read from disk, one being sent).
 */
struct ds_buffer {
	/* array of pages backing the block data (filled in ds_buffer_alloc) */
	page_t** buffers;
	/* blnr[i] is the block number last requested for slot i */
	unsigned long *blnr;
	/* bhs[i] is the buffer_head used for I/O on slot i */
	struct buffer_head **bhs;
	/* capacity: number of block slots in blnr[] / bhs[] */
	int number;
	/* number of leading slots currently in use; set while reading,
	 * reset to 0 by ds_buffer_done() */
	int io_pending_number;
	/* size of one block in bytes */
	int b_size;
};

/*
 * Tell whether block number bnr is currently recorded as in use by the
 * syncer.
 *
 * Scans the first io_pending_number entries of blnr[] in each of the two
 * ds_buffers reachable through mdev->syncer_b.
 *
 * Returns TRUE when bnr is found, FALSE otherwise (also when no syncer
 * buffers are registered, i.e. mdev->syncer_b is NULL).
 */
int ds_check_block(struct Drbd_Conf *mdev, unsigned long bnr)
{
	struct ds_buffer *bufs = mdev->syncer_b;
	int which, slot, in_use;

	if (!bufs)
		return FALSE;

	for (which = 0; which < 2; which++) {
		in_use = bufs[which].io_pending_number;
		for (slot = 0; slot < in_use; slot++) {
			if (bufs[which].blnr[slot] == bnr)
				return TRUE;
		}
	}

	return FALSE;
}

/*
 * I/O completion callback for the syncer's private buffer heads
 * (installed via drbd_init_bh() in ds_buffer_init()).
 *
 * Records the outcome of the read in the buffer's uptodate flag, drops
 * BH_Lock and wakes whoever sleeps on the buffer (ds_buffer_wait_on()
 * waits with wait_on_buffer()).
 *
 * @bh:       the buffer head whose I/O finished
 * @uptodate: non-zero if the read succeeded
 */
STATIC void ds_end_dio(struct buffer_head *bh, int uptodate)
{
	mark_buffer_uptodate(bh, uptodate);
	clear_bit(BH_Lock, &bh->b_state);
	/*
	 * A full barrier is needed here, not just a write barrier: the
	 * cleared lock bit (a store) must be visible before we look at the
	 * wait queue (a load).  Otherwise a waiter that queued itself just
	 * before re-checking BH_Lock could be missed and sleep forever.
	 */
	smp_mb();

	if (waitqueue_active(&bh->b_wait))
		wake_up(&bh->b_wait);
}

/*
 * (Re-)initialise every buffer head of a ds_buffer.
 *
 * Each of the this->number buffer heads gets the block size and the
 * ds_end_dio completion handler, and is pointed at its slice of the
 * backing pages: slot i lives in page i / (blocks per page) at byte offset
 * (i % blocks per page) * b_size.
 *
 * @this:  buffer whose bhs[] / buffers[] arrays are already allocated
 * @minor: unused here
 */
STATIC void ds_buffer_init(struct ds_buffer *this,int minor)
{
	const int blk_bytes = this->b_size;
	const int per_page = PAGE_SIZE / blk_bytes;	/* blocks per page */
	int slot;

	for (slot = 0; slot < this->number; slot++) {
		struct buffer_head *bh = this->bhs[slot];

		drbd_init_bh(bh, blk_bytes, ds_end_dio);
		/* set_bh_page() sets both b_page and b_data */
		set_bh_page(bh,
			    this->buffers[slot / per_page],
			    (slot % per_page) * blk_bytes);
	}
}

/*
 * SR_RU(A,B): shift A right by B bits, rounding up -- one is added to the
 * shifted value whenever any of the B low-order bits of A is set.
 * Both arguments are evaluated twice; do not pass expressions with side
 * effects.
 */
#define SR_RU(A,B) ( ((A) >> (B)) + !!((A) & ((1 << (B)) - 1)) )


/*
 * Allocate all memory for one syncer buffer and initialise its buffer heads.
 *
 * The buffer is sized to hold as many whole blocks as fit into half of the
 * socket's send buffer.  Allocated are: the page pointer array and the
 * pages themselves, the block number array, the buffer_head pointer array
 * and one buffer_head per block.  Any allocation failure ends in BUG().
 *
 * @this:  buffer to set up (all fields are overwritten)
 * @minor: device whose socket and block size determine the dimensions
 */
STATIC void ds_buffer_alloc(struct ds_buffer *this,int minor)
{
	int amount,amount_blks,blocksize,pages,i;

	/* We want to fill half of the send buffer */
	amount=drbd_conf[minor].sock->sk->sndbuf >> 1;
	blocksize = 1 << drbd_conf[minor].blk_size_b;
	amount_blks=amount/blocksize;

	/*
	 * Count pages from the whole blocks we will actually use, not from
	 * the raw byte amount: ds_buffer_free() recomputes the page count
	 * from number * b_size, so any extra page allocated for a partial
	 * trailing block would never be freed.
	 */
	pages=SR_RU(amount_blks*blocksize,PAGE_SHIFT);

	this->number=amount_blks;
	this->io_pending_number=0;
	this->b_size=blocksize;

	this->buffers=kmalloc(sizeof(page_t*) * pages,GFP_USER);
	if(!this->buffers) BUG();

	for(i=0;i<pages;i++) {
		if(! (this->buffers[i]=alloc_page(GFP_USER))) BUG();
	}

	this->blnr = kmalloc(sizeof(unsigned long)*amount_blks,GFP_USER);
	if(!this->blnr) BUG();

	this->bhs = kmalloc(sizeof(struct buffer_head*)*amount_blks,GFP_USER);
	if(!this->bhs) BUG();

	for(i=0;i<amount_blks;i++) {
		this->bhs[i]=kmalloc(sizeof(struct buffer_head),GFP_USER);
		if(!this->bhs[i]) BUG();
	}

	ds_buffer_init(this,minor);
}

/*
 * Release everything a ds_buffer owns: each buffer_head and the array
 * holding them, the block number array, the data pages and the page
 * pointer array.  The number of pages to free is recomputed from
 * number * b_size.  The struct ds_buffer itself is not freed.
 */
STATIC void ds_buffer_free(struct ds_buffer *this)
{
	int idx;
	int total_bytes = this->number * this->b_size;
	int page_cnt = SR_RU(total_bytes, PAGE_SHIFT);

	for (idx = 0; idx < this->number; idx++) {
		kfree(this->bhs[idx]);
	}
	kfree(this->bhs);

	kfree(this->blnr);

	for (idx = 0; idx < page_cnt; idx++) {
		drbd_free_page(this->buffers[idx]);
	}
	kfree(this->buffers);
}

STATIC int ds_buffer_read(struct ds_buffer *this,
		   unsigned long (*get_blk)(void*,int),
		   void* id,
		   int minor)
{
	int count=0;
	int amount_blks=this->number;
	int ln2_bs = drbd_log2(this->b_size);
	unsigned long flags;

	while (count < amount_blks) {
		unsigned long block_nr;

		block_nr=get_blk(id,ln2_bs);
		if(block_nr == MBDS_DONE) break;

		spin_lock_irqsave(&drbd_conf[minor].bb_lock,flags);
		this->io_pending_number=count+1;
		this->blnr[count]=block_nr;
		if(tl_check_sector(drbd_conf+minor,block_nr << (ln2_bs-9))) {
			struct busy_block bl;
			bb_wait_prepare(drbd_conf+minor,block_nr,&bl);
			spin_unlock_irqrestore(&drbd_conf[minor].bb_lock,flags);
			bb_wait(&bl);
		} else spin_unlock_irqrestore(&drbd_conf[minor].bb_lock,flags);

		drbd_set_bh(this->bhs[count],
			    block_nr,
			    drbd_conf[minor].lo_device);
		clear_bit(BH_Uptodate, &this->bhs[count]->b_state);
		set_bit(BH_Lock, &this->bhs[count]->b_state);
		submit_bh(READ,this->bhs[count]);
		count++;
	}

	if(count) {
		run_task_queue(&tq_disk);
	}
	return count;
}

STATIC int ds_buffer_reread(struct ds_buffer *this,int minor)
{
	int i,count;
	unsigned long flags;
	unsigned long block_nr;
	int ln2_bs = drbd_log2(this->b_size);

	count=this->io_pending_number;

	for(i=0;i<count;i++) {

		block_nr = this->blnr[i];

		spin_lock_irqsave(&drbd_conf[minor].bb_lock,flags);
		if(tl_check_sector(drbd_conf+minor,block_nr << (ln2_bs-9))) {
			struct busy_block bl;
			bb_wait_prepare(drbd_conf+minor,block_nr,&bl);
			spin_unlock_irqrestore(&drbd_conf[minor].bb_lock,flags);
			bb_wait(&bl);
		} else spin_unlock_irqrestore(&drbd_conf[minor].bb_lock,flags);

		drbd_set_bh(this->bhs[i], block_nr,
			    drbd_conf[minor].lo_device);

		clear_bit(BH_Uptodate, &this->bhs[i]->b_state);
		set_bit(BH_Lock, &this->bhs[i]->b_state);
		submit_bh(READ,this->bhs[i]);
	}

	if(count) {
		run_task_queue(&tq_disk);
	}

	return count;
}

/*
 * Wait until the reads submitted for a ds_buffer have completed.
 *
 * Walks the io_pending_number slots in order; a buffer that is not yet
 * uptodate is waited for with wait_on_buffer().  For every buffer that
 * ends up uptodate the device's read_cnt is increased by the block size
 * in KB.
 *
 * Returns the number of pending slots on success (0 means there was
 * nothing to wait for), or -1 as soon as one buffer is still not uptodate
 * after waiting; later slots are then not examined.
 */
STATIC int ds_buffer_wait_on(struct ds_buffer *this,int minor)
{
	const int in_flight = this->io_pending_number;
	const int kb_per_block = this->b_size >> 10;
	int slot;

	for (slot = 0; slot < in_flight; slot++) {
		struct buffer_head *bh = this->bhs[slot];

		if (!buffer_uptodate(bh)) {
			wait_on_buffer(bh);
		}
		smp_rmb(); // pairs with ds_end_dio(): see its flag updates
		if (!buffer_uptodate(bh)) {
			printk(KERN_ERR DEVICE_NAME "%d: !uptodate\n", minor);
			return -1;
		}
		drbd_conf[minor].read_cnt += kb_per_block;
	}

	return in_flight;
}

/*
 * Mark a ds_buffer as no longer holding any blocks.
 *
 * Resets io_pending_number to 0 and then calls bb_done() for every block
 * number that had been recorded.  The callers in this file invoke it with
 * mdev->bb_lock held.
 */
STATIC inline void ds_buffer_done(struct Drbd_Conf *mdev, struct ds_buffer *this)
{
	const int finished = this->io_pending_number;
	int slot;

	this->io_pending_number = 0;
	for (slot = 0; slot < finished; slot++) {
		bb_done(mdev, this->blnr[slot]);
	}
}

/*
 * Send the blocks held in a ds_buffer to the peer, then release the buffer.
 *
 * Blocks are sent in slot order with drbd_send_block(..., ID_SYNCER).
 * Sending stops early when the syncer thread is no longer in state Running
 * or when drbd_send_block() returns less than the block size in KB.
 * Whatever the outcome, the buffer is afterwards released with
 * ds_buffer_done() under bb_lock.
 *
 * Returns TRUE if all pending blocks were sent, FALSE otherwise.
 */
STATIC int ds_buffer_send(struct Drbd_Conf *mdev, struct ds_buffer *this)
{
	const int to_send = this->io_pending_number;
	const int size_kb = this->b_size >> 10;
	unsigned long flags;
	int ok = TRUE;
	int slot, sent;

	for (slot = 0; slot < to_send; slot++) {
		/* maybe someone has drbd_thread_stop_nowait()ed us.
		 * drbd_send_block won't notice (DRBD_SIG != DRBD_SIGKILL),
		 * and we'd need to wait for timeout...
		 * XXX maybe check for == DRBD_SIGKILL
		 */
		if (mdev->syncer.t_state != Running) {
			ok = FALSE;
			break;
		}

		sent = drbd_send_block(mdev, this->bhs[slot], ID_SYNCER);
		if (sent < size_kb) {
			ok = FALSE;
			break;
		}
	}

	spin_lock_irqsave(&mdev->bb_lock, flags);
	ds_buffer_done(mdev, this);
	spin_unlock_irqrestore(&mdev->bb_lock, flags);

	return ok;
}

/*
 * Block source for a full sync: hands out the device's blocks one after
 * another.
 *
 * @id:     the struct Drbd_Conf of the device being synced
 * @ln2_bs: log2 of the block size in bytes
 *
 * mdev->synced_to is kept in 512 byte sectors.  Once it equals the start
 * sector of the last full block of the device (the size from blk_size[][]
 * truncated to full blocks) MBDS_DONE is returned.  Otherwise synced_to is
 * advanced by one block and the resulting block number is returned.
 */
STATIC unsigned long ds_sync_all_get_blk(void* id, int ln2_bs)
{
	struct Drbd_Conf *mdev = id;
	const int to_sectors = ln2_bs - 9;	/* block nr <-> sector nr */
	const int minor = (int)(mdev - drbd_conf);

	// truncate to full blocks; convert to sectors;
	if (mdev->synced_to ==
	    (((blk_size[MAJOR_NR][minor] >> (ln2_bs - 10)) - 1) << to_sectors)) {
		return MBDS_DONE;
	}

	mdev->synced_to += (1L << to_sectors);

	return mdev->synced_to >> to_sectors;
}

/*
 * swap(a,b): exchange the values of two lvalues.
 * Requires a variable named `tmp` of a compatible type in the calling scope.
 * The do { } while (0) wrapper makes "swap(x,y);" a single statement, so it
 * is safe as the body of an unbraced if/else; arguments are parenthesized.
 */
#define swap(a,b) do { tmp=(a); (a)=(b); (b)=tmp; } while (0)

/*lge
 * progress bars shamelessly adapted from drivers/md/md.c
 */
/* hardcoded for now */
/* SYNC_MARKS: size of the mark[] / mark_cnt[] sample arrays in drbd_syncer() */
#define SYNC_MARKS      10
/* SYNC_MARK_STEP: jiffies between two samples (three seconds worth of HZ) */
#define SYNC_MARK_STEP  (3*HZ)
/*
 * drbd_set_user_nice(current,x): give the task the nice value x, using
 * whichever interface the kernel being built against is expected to offer.
 */
#if defined(HAVE_O1_SCHED)
	/* this should work for the O(1) scheduler */
#define drbd_set_user_nice(current,x) set_user_nice(current,(x))
#else
	/* FIXME which kernel introduced ->nice ? */
# if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,0)
	/* for 2.2 kernel: no nice value, map it onto ->priority instead */
#  define drbd_set_user_nice(current,x) (current->priority = 20-(x))
# else
	/* 2.4 */
#  define drbd_set_user_nice(current,x) (set_user_nice(current, (x)))
# endif
#endif

/*
 * Hold back this device's resync while a device of a lower sync group is
 * still busy.
 *
 * The current cstate is remembered in mdev->sync_method.  Then all minors
 * are scanned; whenever one with a smaller sync.group has a cstate above
 * Connected, this device is switched to SyncPaused (and the peer told so),
 * and we sleep on that device's cstate_wait until it no longer blocks us.
 * After each wait the scan starts over, until a full pass finds no
 * blocking device.  If we had to wait at all, the remembered cstate is
 * restored and sent to the peer.
 *
 * Returns early, without restoring the cstate, when a signal is pending
 * for the current task.
 */
void drbd_wait_for_other_sync_groups(struct Drbd_Conf *mdev)
{
	int i;
	int did_wait=0;
	/* the pointer difference is a ptrdiff_t, which does not match "%d"
	 * on 64 bit platforms; convert once for the printk()s below */
	int minor=(int)(mdev-drbd_conf);

	mdev->sync_method = mdev->cstate;
	do {
		for (i=0; i < minor_count; i++) {
			if (signal_pending(current)) return;
			if ( drbd_conf[i].sync.group < mdev->sync.group
			  && drbd_conf[i].cstate > Connected )
			{
				int ret;
				printk(KERN_INFO DEVICE_NAME
					"%d: Syncer waits for sync group %i\n",
					minor,
					drbd_conf[i].sync.group
				);
				set_cstate(mdev,SyncPaused);
				drbd_send_cstate(mdev);
				ret = wait_event_interruptible(
					drbd_conf[i].cstate_wait,
					drbd_conf[i].sync.group >= mdev->sync.group ||
					drbd_conf[i].cstate <= Connected );
				// FIXME if (ret < 0) do something sensible...
				did_wait=1;
				// XXX why sleep again?
				current->state = TASK_INTERRUPTIBLE;
				schedule_timeout(HZ/10);
				/* leave the for loop with i < minor_count,
				 * so the do/while rescans from minor 0 */
				break;
			}
		}
	} while (i < minor_count);
	if (did_wait) {
		printk(KERN_INFO DEVICE_NAME
			"%d: resumed synchronisation.\n",
			minor
		);
		set_cstate(mdev,mdev->sync_method);
		drbd_send_cstate(mdev);
	}
}

/*
 * Main function of the per-device syncer thread.
 *
 * Depending on the connection state it either walks the whole device
 * (SyncingAll, blocks from ds_sync_all_get_blk()) or only the blocks handed
 * out by bm_get_blocknr() (SyncingQuick).  In any other state it returns
 * immediately.
 *
 * Two ds_buffers are used in alternation: while the blocks of one buffer
 * are sent to the peer (net_b), the reads for the next batch are already
 * under way in the other one (disk_b).
 *
 * Progress is sampled into mark[] / mark_cnt[] every SYNC_MARK_STEP jiffies;
 * from these the current speed is estimated and compared against
 * mdev->sync.min / mdev->sync.max: below min the thread is reniced to -20,
 * above max it sleeps for half a second before looking again.
 *
 * A failed read is retried up to five times.  On completion SetConsistent
 * is sent to the peer.  On every exit past the buffer allocation the
 * connection state is put back to Connected (if it still is one of the
 * Syncing states), the buffers are unregistered from mdev and freed.
 *
 * @thi: thread descriptor; thi->minor selects the device, thi->t_state is
 *       polled for Exiting
 *
 * Always returns 0.
 */
int drbd_syncer(struct Drbd_thread *thi)
{
	int minor = thi->minor;
	struct Drbd_Conf *mdev = drbd_conf+minor;
	struct ds_buffer buffers[2];
	struct ds_buffer *disk_b, *net_b, *tmp; /* tmp is used by swap() */
	int amount,amount_blks;
	int my_blksize,ln2_bs,retry;
	unsigned long (*get_blk)(void*,int);
	void* id;
	unsigned long flags;
	unsigned long mark[SYNC_MARKS];
	unsigned long mark_cnt[SYNC_MARKS];
	unsigned int currspeed;
	int last_mark,m;

	sprintf(current->comm, "drbd_syncer_%d", minor);

	/* We want to fill half of the send buffer; amount is in KB */
	amount=drbd_conf[minor].sock->sk->sndbuf >> (1+10);
	my_blksize= 1 << drbd_conf[minor].blk_size_b;
	ln2_bs = drbd_conf[minor].blk_size_b;
	amount_blks=(amount<<10)/my_blksize;

	printk(KERN_INFO DEVICE_NAME "%d: Synchronisation started blks=%d\n",
		minor,amount_blks);

	if(drbd_conf[minor].cstate == SyncingAll) {
		/* one block before sector 0: ds_sync_all_get_blk() advances
		 * synced_to by one block before it returns a block number */
		drbd_conf[minor].synced_to = -( 1UL << (ln2_bs-9));

		get_blk=&ds_sync_all_get_blk;
		id=drbd_conf+minor;
	} else if(drbd_conf[minor].cstate == SyncingQuick) {
		bm_reset(drbd_conf[minor].mbds_id,
			 drbd_conf[minor].blk_size_b);
		get_blk=(unsigned long (*)(void*,int))&bm_get_blocknr;
		id=drbd_conf[minor].mbds_id;
	} else {
		/* print warning/error ? */
		return 0;
	}

	ds_buffer_alloc(&buffers[0],minor);
	ds_buffer_alloc(&buffers[1],minor);
	disk_b=buffers;
	net_b=buffers+1;

	/* make the buffers visible to ds_check_block() */
	spin_lock_irqsave(&drbd_conf[minor].bb_lock,flags);
	drbd_conf[minor].syncer_b = buffers;
	spin_unlock_irqrestore(&drbd_conf[minor].bb_lock,flags);

	/*
	 * Resync has low priority.
	 */
	drbd_set_user_nice(current,mdev->sync.nice);

	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = 0;
	}
	last_mark = 0;
	drbd_conf[minor].resync_mark_start = mark[last_mark];
	drbd_conf[minor].resync_mark = mark[last_mark];
	drbd_conf[minor].resync_mark_cnt = mark_cnt[last_mark];

	//if (mdev->cstate == SyncingAll)
	drbd_wait_for_other_sync_groups(mdev);
	if(thi->t_state == Exiting) { // unlikely, but anyways ...
		printk(KERN_ERR DEVICE_NAME
		       "%d: Syncer aborted before it started.\n",minor);
		goto err;
	}
	/* start the first batch of reads */
	ds_buffer_read(disk_b,get_blk,id,minor);

	while (TRUE) {
		retry=0;
	retry:
		/*
		 * Wrap-safe form of "jiffies >= mark + step": compare via
		 * the signed difference, so the test stays correct when
		 * jiffies (or mark + step) wraps around.
		 */
		if ((long)(jiffies - (mark[last_mark] + SYNC_MARK_STEP)) >= 0) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			/* the slot about to be overwritten holds the oldest
			 * sample; it becomes the reference for the speed */
			mdev->resync_mark = mark[next];
			mdev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
		/*
		 * there may be an issue due to non atomic_t of synced_to, etc.
		 * could even be related to "access beyond end of device"
		 * please tell me I'm wrong.             lge
		 */
			mark_cnt[next] = mdev->synced_to;
			last_mark = next;
		}
		if (current->need_resched)
			schedule();

		/* progress since the reference mark (synced_to counts
		 * sectors, so /2), per second elapsed since that mark */
		currspeed = -(mdev->resync_mark_cnt - mdev->synced_to)/2
		          / ((jiffies - mdev->resync_mark)/HZ +1)         +1;

		if (currspeed >= (mdev->sync.min)) {
			drbd_set_user_nice(current,mdev->sync.nice);

			if ( (currspeed > mdev->sync.max)
				/* what to do with this one?
				|| !is_mddev_idle(mddev) */
				)
			{
				current->state = TASK_INTERRUPTIBLE;
				schedule_timeout(HZ/2);
				goto retry;
				/* this is no retry++, but slowdown */
			}
		} else
			drbd_set_user_nice(current,-20);

		//if (mdev->cstate == SyncingAll)
		drbd_wait_for_other_sync_groups(mdev);
		// signal pending is checked below with thi->t_state == Exiting

		switch(ds_buffer_wait_on(disk_b,minor)) {
		case 0: goto done;  /* finished */
		case -1:
			if(my_blksize != blksize_size[MAJOR_NR][minor]) {
				printk(KERN_ERR DEVICE_NAME
				       "%d: Changing blksize not supported\n"
				       "Please consider contributing it!\n",
				       minor);
			} else {
				printk(KERN_ERR DEVICE_NAME
				       "%d: Syncer reread.\n",minor);
				ds_buffer_init(disk_b,minor);
				ds_buffer_reread(disk_b,minor);
			}
			if(retry++ < 5) goto retry;
			printk(KERN_ERR DEVICE_NAME
			       "%d: Syncer read failed.\n",minor);
			goto err;
		default:
			/* a batch of blocks has been read; go on and send it */
			break;
		}
		/* what was just read is sent next; the other buffer gets
		 * refilled from disk */
		swap(disk_b,net_b);
		if(thi->t_state == Exiting) {
			ds_buffer_send(mdev,net_b);
			printk(KERN_ERR DEVICE_NAME
			       "%d: Syncer aborted.\n",minor);
			goto err;
		}
		ds_buffer_read(disk_b,get_blk,id,minor);
		if(!ds_buffer_send(mdev,net_b)) {
			/* let the reads just submitted finish before the
			 * buffers are torn down */
			ds_buffer_wait_on(disk_b,minor);
			printk(KERN_ERR DEVICE_NAME
			       "%d: Syncer send failed.\n",minor);
			goto err;
		}
	}

 done:
	drbd_send_cmd(mdev,SetConsistent,0);
	mdev->sync_method = 0;
	if (mdev->sync.skip == 1)
		mdev->sync.skip = 0;
	printk(KERN_INFO DEVICE_NAME "%d: Synchronisation done.\n",minor);
	/* fall through: the cleanup below is shared with the error paths */

 err:
	if(mdev->cstate == SyncingAll ||
	   mdev->cstate == SyncingQuick) {
		set_cstate(mdev,Connected);
		drbd_send_cstate(mdev);
	}

	spin_lock_irqsave(&mdev->bb_lock,flags);
	mdev->syncer_b = 0;
	ds_buffer_done(mdev,disk_b);
	ds_buffer_done(mdev,net_b);
	spin_unlock_irqrestore(&mdev->bb_lock,flags);

	ds_buffer_free(&buffers[0]);
	ds_buffer_free(&buffers[1]);

	return 0;
}
#undef SYNC_MARKS
#undef SYNC_MARK_STEP
#undef drbd_set_user_nice
