/*
 * nasd_od_uio.c
 *
 * User-level I/O for integrated NASD.
 *
 * Authors: Jim Zelenka, Marc Unangst
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_common.h>
#include <nasd/nasd_timer.h>
#include <nasd/nasd_control.h>
#define NASD_DRIVE_IO_MODULE 1
#include <nasd/nasd_drive_io.h>
#include <nasd/nasd_ioqueue.h>

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <signal.h>

#include <sys/param.h>
#include <sys/uio.h>
#ifdef DEC_OSF
#include <nlist.h>
#include <sys/utctime.h>
#include <sys/resource.h>
#endif /* DEC_OSF */
#include <unistd.h>

#if NASD_SECURITY_KEEP_STATS > 0 
#include <nasd/nasd_security.h>
#endif /* NASD_SECURITY_KEEP_STATS > 0  */

/*
 * This I/O module is designed to be a generic user-level
 * module for most unices.
 *
 * I/Os are executed by a set of worker threads- at system
 * startup time, nasd_od_ioq_max_outstanding of these threads
 * are created. nasd_u_pending is a list of I/O chains to
 * perform (linked on the cnext field). The act of launching
 * an I/O is putting it in this list, and signalling nasd_u_q_cond.
 * The list itself is protected by nasd_u_q_lock (operated on
 * in code by NASD_IO_PEND_LOCK()/NASD_IO_PEND_UNLOCK() operations).
 * When an I/O is launched, nasd_od_io_ios_outstanding is incremented.
 *
 * Each worker thread sleeps on nasd_u_q_cond if there is no
 * work to do. When it wakes up, if nasd_u_pending is
 * empty, it resumes sleeping. Otherwise, it dequeues the next
 * I/O chain from that list and executes the I/O. When it is
 * done executing an I/O, it first checks whether nasd_u_pending contains
 * another I/O. If so, it executes it, and continues in this manner
 * until nasd_u_pending does not contain any I/Os. At this point, it
 * will attempt to retrieve the next I/O directly from the queue
 * module by calling nasd_od_io_deq_next(). If this retrieves an
 * I/O chain, it will continue dispatching I/Os, always checking
 * first nasd_u_pending then the queue module until there are no
 * pending I/Os in the system, at which point it decrements
 * nasd_od_io_ios_outstanding, and sleeps on nasd_u_q_cond.
 *
 * Note that changes to nasd_od_io_ios_outstanding are protected by
 * nasd_u_q_lock.
 *
 * Asynchronous block flushing is not really implemented- the
 * asynchronous flush operation in turn calls the synchronous flush
 * operation.
 */

/*
 * Thread groups: one for the I/O worker threads (nasd_od_uio_proc)
 * and one for the periodic flush thread (nasd_od_uio_flush_proc).
 */
nasd_threadgroup_t nasd_u_io_threadgroup;
nasd_threadgroup_t nasd_u_flushproc_threadgroup;

/*
 * Pipe used to wake the flush thread out of select(); the shutdown
 * path writes one byte to the write end. Both ends are (-1) while
 * the pipe is not open.
 */
int nasd_u_flushpipe[2] = {(-1),(-1)};
#define nasd_u_flushpipe_read  nasd_u_flushpipe[0]
#define nasd_u_flushpipe_write nasd_u_flushpipe[1]

/*
 * nasd_u_devname is the path of the backing device; every worker
 * thread opens its own descriptor on it. nasd_u_extrafd is one more
 * descriptor on the same device, used by the synchronous paths in
 * this file (block flush, disk header read/write); seek+I/O pairs on
 * it are serialized by nasd_u_extrafd_lock.
 */
NASD_DECLARE_MUTEX(nasd_u_extrafd_lock)
char nasd_u_devname[MAXPATHLEN];
int nasd_u_extrafd = (-1);

/*
 * nasd_u_run_cond is signalled when a thread in this module starts
 * or stops. nasd_u_q_cond is signalled when an I/O chain is put on
 * nasd_u_pending. nasd_u_q_lock protects nasd_u_pending and
 * nasd_od_io_ios_outstanding.
 */
NASD_DECLARE_COND(nasd_u_run_cond)
NASD_DECLARE_COND(nasd_u_q_cond)
NASD_DECLARE_MUTEX(nasd_u_q_lock)

/* Defined elsewhere; nonzero requests an rusage dump at shutdown. */
extern int nasd_od_rusage_at_shutdown;

/*
 * Record the position of the most recently completed I/O: converts
 * the byte offset _off_ to a sector number (shift by
 * NASD_OD_SECT_SHIFT) and stores it in nasd_od_io_last_completed_sect.
 */
#define NASD_IO_LASTCOMP_ASSIGN(_off_) { \
  nasd_uint64 _off; \
  _off = (_off_); \
  _off >>= NASD_OD_SECT_SHIFT; \
  nasd_od_io_last_completed_sect = _off; \
}

/* Serialize use of nasd_u_extrafd. */
#define NASD_IO_EFD_LOCK()   NASD_LOCK_MUTEX(nasd_u_extrafd_lock)
#define NASD_IO_EFD_UNLOCK() NASD_UNLOCK_MUTEX(nasd_u_extrafd_lock)

/*
 * Protect the pending list and the outstanding-I/O count.
 * Lock ordering: if the I/O lock must also be held, take it
 * before taking this lock.
 */
#define NASD_IO_PEND_LOCK()   NASD_LOCK_MUTEX(nasd_u_q_lock)
#define NASD_IO_PEND_UNLOCK() NASD_UNLOCK_MUTEX(nasd_u_q_lock)

/*
 * Number of times a worker re-issues a readv()/writev() that did not
 * transfer the expected byte count before it panics.
 */
int nasd_u_max_retries = 3;

/*
 * Sentinel head of the circular, doubly-linked (cnext/cprev) list of
 * I/O chains waiting for a worker thread.
 */
nasd_odc_ent_t nasd_u_pending;

/*
 * Body of the background flush thread.
 *
 * Announces itself as running, then loops on a select() of the flush
 * pipe with a 5 second timeout. Each time the timeout expires with no
 * pipe activity it bumps the auto_flush statistic and calls
 * nasd_odc_flush_dirty(0). The loop ends when shutdown has been
 * indicated on nasd_u_flushproc_threadgroup or when a byte arrives on
 * the flush pipe (written by nasd_u_stop_flush_proc).
 *
 * select() interrupted by a signal (EINTR) is simply retried. Any
 * other select() failure is fatal: the fd set is not meaningful after
 * a failed select(), so it must not be tested -- doing so could send
 * the thread into a blocking read() on the pipe.
 *
 * The thread argument is not used.
 */
void
nasd_od_uio_flush_proc(
  nasd_threadarg_t  ignored)
{
  struct timeval tv;
  fd_set rfd, lrfd;
  nasd_status_t rc;
  int ret, select_errno;
  char c;

  NASD_IO_PEND_LOCK();
  NASD_THREADGROUP_RUNNING(&nasd_u_flushproc_threadgroup);
  NASD_IO_PEND_UNLOCK();
  NASD_BROADCAST_COND(nasd_u_run_cond);
  FD_ZERO(&rfd);
  FD_SET(nasd_u_flushpipe_read, &rfd);
  while (!NASD_THREADGROUP_SHUTDOWNP(&nasd_u_flushproc_threadgroup)) {
    /* select() may modify both the timeout and the set; reload them */
    tv.tv_sec = 5;
    tv.tv_usec = 0;
    lrfd = rfd;
#if MJU_DEBUG
    fprintf(stderr, "flushproc: entering select\n");
#endif /* MJU_DEBUG */
    ret = select(nasd_u_flushpipe_read+1, &lrfd, NULL, NULL, &tv);
    /* capture errno before any other call has a chance to change it */
    select_errno = errno;
    if (NASD_THREADGROUP_SHUTDOWNP(&nasd_u_flushproc_threadgroup))
      break;
#if MJU_DEBUG
    fprintf(stderr, "flushproc: back from select, ret=%d errno=%d\n",
            ret, select_errno);
#endif /* MJU_DEBUG */
    if (ret < 0) {
      if (select_errno == EINTR)
        continue;
      nasd_printf("DRIVE WARNING: got unexpected ret=%d errno=%d from select\n",
        ret, select_errno);
      NASD_PANIC();
    }
    if (FD_ISSET(nasd_u_flushpipe_read, &lrfd)) {
#if MJU_DEBUG
      fprintf(stderr, "flushproc: activity on flushpipe\n");
#endif /* MJU_DEBUG */
      /* consume the wakeup byte; its arrival means "stop" */
      ret = read(nasd_u_flushpipe_read, &c, 1);
      if (ret != 1) {
        NASD_PANIC();
      }
      break;
    }
    /* only a timeout (ret == 0) is legitimate at this point */
    if (ret) {
      NASD_PANIC();
    }
    NASD_IO_INC_STAT(auto_flush);
    rc = nasd_odc_flush_dirty(0);
    if (rc) {
      NASD_PANIC();
    }
  }
#if MJU_DEBUG
  fprintf(stderr, "flushproc: trying to stop\n");
#endif /* MJU_DEBUG */
  NASD_THREADGROUP_DONE(&nasd_u_flushproc_threadgroup);
  NASD_BROADCAST_COND(nasd_u_run_cond);
#if MJU_DEBUG
  fprintf(stderr, "flushproc: done\n");
#endif /* MJU_DEBUG */
}

/*
 * Body of an I/O worker thread.
 *
 * Opens a private descriptor on nasd_u_devname, announces itself as
 * running, and then services I/O chains until shutdown is indicated
 * on nasd_u_io_threadgroup.
 *
 * Locking: the pending lock (NASD_IO_PEND_LOCK) is held at the top of
 * every loop iteration and at the do_dispatch label. It is dropped
 * while the actual readv()/writev() is in progress and while
 * completions are announced, and re-taken before the loop condition
 * is evaluated again.
 *
 * arg carries a small integer cast to a thread argument; it is stored
 * in t but not otherwise used here.
 */
void
nasd_od_uio_proc(
  nasd_threadarg_t  arg)
{
  int j, wb, t, fd, rc, iodir, retry_count;
  struct iovec iov[NASD_IO_MAX_COALESCE];
  nasd_odc_ent_t *e, *dispatch, *next;
  nasd_uint64 want, got;

#if MJU_DEBUG
  nasd_thread_id_t self = nasd_thread_self();
  fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT "] running\n", self);
#endif /* MJU_DEBUG */

  t = (int)((u_long)arg);
  /* each worker seeks independently, so each needs its own descriptor */
  fd = open(nasd_u_devname, O_RDWR);
  if (fd < 0) {
    perror(nasd_u_devname);
    NASD_PANIC();
  }
  NASD_IO_PEND_LOCK();
  NASD_THREADGROUP_RUNNING(&nasd_u_io_threadgroup)
  NASD_SIGNAL_COND(nasd_u_run_cond);
  while(!NASD_THREADGROUP_SHUTDOWNP(&nasd_u_io_threadgroup)) {
    if (nasd_u_pending.cnext != &nasd_u_pending) {
      /*
       * We have a pending I/O, dispatch it.
       */
#if MJU_DEBUG
      fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT
             "]: pending I/O on dispatch queue\n", self);
#endif /* MJU_DEBUG */
      dispatch = nasd_u_pending.cnext;
#if MJU_DEBUG
      fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT
             "]: pending I/O is 0x%lx\n", self, dispatch);
#endif /* MJU_DEBUG */
      /* remove from pending queue */
      dispatch->cnext->cprev = dispatch->cprev;
      dispatch->cprev->cnext = dispatch->cnext;
      dispatch->cnext = dispatch->cprev = NULL;
do_dispatch:
      /*
       * Entered with the pending lock held and dispatch pointing at
       * the head of a chain (linked through inext) to execute.
       */
      iodir = dispatch->iodir;
      NASD_IO_INC_STAT(pull_ios);

      /*
       * Build one iovec entry per block in the chain. On exit j is
       * the number of blocks and wb the total byte count expected
       * from the transfer.
       */
      for(wb=j=0,e=dispatch;e;e=e->inext) {
        NASD_ASSERT(j<NASD_UIO_MAXIOV);
        NASD_ASSERT(j<NASD_IO_MAX_COALESCE);
        NASD_ASSERT(e->blkno <= nasd_od_blocks);
        NASD_ASSERT(e->iodir == iodir);
        NASD_ASSERT((e->io_flags&(NASD_CI_DISPATCH|NASD_CI_IOQ)) == NASD_CI_DISPATCH);
        iov[j].iov_base = (caddr_t)e->data.buf;
        iov[j].iov_len = NASD_OD_BASIC_BLOCKSIZE;
        j++;
        wb += NASD_OD_BASIC_BLOCKSIZE;
        NASD_IO_TM_LAUNCH(e);
      }
      if (iodir == NASD_U_READ) {
        NASD_IO_INC_SIZE_STAT(j,read);
      }
      else {
        NASD_IO_INC_SIZE_STAT(j,write);
      }
      NASD_IO_PEND_UNLOCK();
      /*
       * Actually do the I/O.
       */
      retry_count = 0;
      /* byte offset of the first block in the chain */
      want = ((nasd_uint64)dispatch->real_sectno) << NASD_OD_SECT_SHIFT;
do_retry:
      /* a retry re-seeks, so the whole transfer is re-issued from the start */
      got = nasd_lseek(fd, want, SEEK_SET, &rc);
      if (got != want) {
        NASD_PANIC();
      }
      switch(iodir) {
        case NASD_U_READ:
          NASD_IO_INC_STAT(num_io_reads);
          rc = readv(fd, iov, j);
          break;
        case NASD_U_WRITE:
          NASD_IO_INC_STAT(num_io_writes);
          rc = writev(fd, iov, j);
          break;
        default:
          NASD_PANIC();
      }
      /*
       * NOTE(review): this records the end position even for a short
       * transfer that is about to be retried; rc is an int although
       * readv()/writev() return ssize_t.
       */
      if (rc > 0) {
        NASD_IO_LASTCOMP_ASSIGN(got + (nasd_uint64)rc);
      }

      /*
       * Anything other than a complete transfer is retried up to
       * nasd_u_max_retries times; after that it is fatal.
       */
      if (rc != wb) {
        nasd_printf("DRIVE UIO: got rc=%d wanted wb=%d\n", rc, wb);
        nasd_printf("DRIVE UIO: got=%" NASD_64u_FMT
          " j=%d errno %d retry_count %d\n",
          (nasd_uint64)got, j, errno, retry_count);
        retry_count++;
        if (retry_count <= nasd_u_max_retries) {
          NASD_IO_INC_STAT(retries);
          goto do_retry;
        }
        NASD_PANIC();
      }

      /* mark every block in the chain as no longer dispatched */
      NASD_IO_LOCK();
      for(e=dispatch;e;e=e->inext) {
        e->io_flags &= ~NASD_CI_DISPATCH;
        NASD_IO_TM_COMPLETE(e);
      }
      NASD_IO_UNLOCK();
      /*
       * Announce completion of all I/Os in list
       */
      for(e=dispatch;e;e=next) {
        /* grab the successor first: the entry is unlinked before iodone */
        next = e->inext;
        e->inext = e->iprev = NULL;
        nasd_od_io_iodone(e);
      }
      /*
       * Look for more work before going back to sleep: first the
       * pending list, then (without the pending lock held) the queue
       * module via nasd_od_io_deq_next().
       */
      NASD_IO_PEND_LOCK();
      if (nasd_u_pending.cnext != &nasd_u_pending) {
        dispatch = nasd_u_pending.cnext;
        dispatch->cnext->cprev = dispatch->cprev;
        dispatch->cprev->cnext = dispatch->cnext;
        dispatch->cnext = dispatch->cprev = NULL;
      }
      else {
        dispatch = NULL;
      }
      NASD_IO_PEND_UNLOCK();
      if (dispatch == NULL) {
        nasd_od_io_deq_next(&dispatch, 0);
      }
      NASD_IO_PEND_LOCK();
      if (dispatch) {
        /* We already have our next I/O in hand. Do it. */
        goto do_dispatch;
      }
      else {
        /*
         * Nothing left to chain onto this slot: give it back. The
         * count is only dropped here, not once per chained I/O.
         */
        nasd_od_io_ios_outstanding--;
      }
    }
    else {
#if MJU_DEBUG
      fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT
             "]: no I/Os on queues, waiting\n", self);
#endif /* MJU_DEBUG */
      /* sleep until an I/O is launched or shutdown is broadcast */
      NASD_WAIT_COND(nasd_u_q_cond,nasd_u_q_lock);
#if MJU_DEBUG
      fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT
             "]: back from wait\n", self);
#endif /* MJU_DEBUG */
    }
  }
  /* shutdown: the pending lock is still held here */
  close(fd);
  NASD_IO_PEND_UNLOCK();
  NASD_SIGNAL_COND(nasd_u_run_cond);
  NASD_THREADGROUP_DONE(&nasd_u_io_threadgroup)
#if MJU_DEBUG
  fprintf(stderr, "nasd_od_uio_proc[%" NASD_THREAD_ID_FMT "]: shutting down\n", self);
#endif /* MJU_DEBUG */
}

/*
 * Shutdown-list callback: close a file descriptor.
 *
 * The descriptor number travels inside the opaque pointer argument
 * (it was cast to u_long and then to void * when registered), so it
 * is unpacked by reversing those casts.
 */
void
nasd_u_close_fd(
  void  *arg)
{
  close((int)((u_long)arg));
}

/*
 * Shutdown-list callback: destroy the I/O worker thread group.
 * A failure is reported but otherwise ignored. The argument is
 * not used.
 */
void
nasd_u_shutdown_io_threadgroup(
  void  *ignored)
{
  nasd_status_t status = nasd_destroy_threadgroup(&nasd_u_io_threadgroup);

  if (!status)
    return;

  nasd_printf("DRIVE WARNING: got 0x%x (%s) destroying nasd_u_io_threadgroup\n",
    status, nasd_error_string(status));
}

/*
 * Shutdown-list callback: destroy the flush thread's thread group.
 * A failure is reported but otherwise ignored. The argument is
 * not used.
 */
void
nasd_u_shutdown_flushproc_threadgroup(
  void  *ignored)
{
  nasd_status_t status = nasd_destroy_threadgroup(&nasd_u_flushproc_threadgroup);

  if (!status)
    return;

  nasd_printf(
    "DRIVE WARNING: got 0x%x (%s) destroying nasd_u_flushproc_threadgroup\n",
    status, nasd_error_string(status));
}

/*
 * Stop all I/O worker threads and wait for them to exit.
 *
 * Order matters: shutdown is flagged first, then every worker
 * sleeping on nasd_u_q_cond is woken so it re-checks the flag, and
 * only then does the caller block until the group has stopped.
 * The argument is not used (the signature matches a shutdown-list
 * callback).
 */
void
nasd_u_stop_iothreads(
  void  *ignored)
{
  NASD_THREADGROUP_INDICATE_SHUTDOWN(&nasd_u_io_threadgroup);
  NASD_BROADCAST_COND(nasd_u_q_cond);
  NASD_THREADGROUP_WAIT_STOP(&nasd_u_io_threadgroup);
}

/*
 * Stop the flush thread and wait for it to exit.
 *
 * Kicks the dirty-block machinery, flags shutdown on the flush
 * thread group, sets nasd_odc_dirtythread_force, and then writes a
 * single byte to the flush pipe so the flush thread drops out of its
 * select(). Failure to write that byte is fatal. The argument is not
 * used.
 */
void
nasd_u_stop_flush_proc(
  void  *ignored)
{
  char kick = 'a';
  int nwritten;

  nasd_odc_dirty_kick();
  NASD_THREADGROUP_INDICATE_SHUTDOWN(&nasd_u_flushproc_threadgroup);
  nasd_odc_dirtythread_force = 1;

  /* one byte is enough: the reader only cares that the pipe is readable */
  nwritten = write(nasd_u_flushpipe_write, &kick, 1);
  if (nwritten != 1) {
    nasd_printf("DRIVE ERROR: got %d expected 1 kicking pipe %d (errno %d)\n",
      nwritten, nasd_u_flushpipe_write, errno);
    NASD_PANIC();
  }

#if MJU_DEBUG
  fprintf(stderr, "stop_flush_proc: wait for shutdown\n");
#endif /* MJU_DEBUG */
  NASD_THREADGROUP_WAIT_STOP(&nasd_u_flushproc_threadgroup);
#if MJU_DEBUG
  fprintf(stderr, "stop_flush_proc: done waiting for shutdown\n");
#endif /* MJU_DEBUG */
}

/*
 * Initialize the condition variable _c_ and register it on the
 * nasd_odc_shutdown list for destruction at shutdown.
 *
 * Must be used inside a function that has a local named rc and
 * returns a status: on either failure the macro returns rc from the
 * enclosing function.
 *
 * Wrapped in do { } while (0) so that "NASD_U_ICOND(x);" is a single
 * statement and is safe as the body of an unbraced if/else.
 */
#define NASD_U_ICOND(_c_) do { \
  rc = nasd_cond_init(_c_); \
  if (rc) { \
    return(rc); \
  } \
  rc = nasd_shutdown_cond(nasd_odc_shutdown, _c_); \
  if (rc) { \
    return(rc); \
  } \
} while (0)

/*
 * Initialize the mutex _m_ and register it on the nasd_odc_shutdown
 * list for destruction at shutdown.
 *
 * Same usage contract as NASD_U_ICOND: requires a local rc and
 * returns rc from the enclosing function on failure.
 */
#define NASD_U_MUTEX(_m_) do { \
  rc = nasd_mutex_init(_m_); \
  if (rc) { \
    return(rc); \
  } \
  rc = nasd_shutdown_mutex(nasd_odc_shutdown, _m_); \
  if (rc) { \
    return(rc); \
  } \
} while (0)

void
nasd_u_kill_flushpipe(
  void *ignored)
{
  close(nasd_u_flushpipe_read);
  nasd_u_flushpipe_read = (-1);
  close(nasd_u_flushpipe_write);
  nasd_u_flushpipe_write = (-1);
}

#ifdef DEC_OSF
/*
 * Shutdown-list callback (DEC OSF builds only): print the resource
 * usage of this process, as reported by getrusage(RUSAGE_SELF).
 * If getrusage() fails, an error is reported and nothing else is
 * printed. The argument is not used.
 */
void
nasd_u_shutdown_rusage(
  void  *ignored)
{
  struct rusage usage;

  if (getrusage(RUSAGE_SELF, &usage) != 0) {
    perror("DRIVE: rusage");
    nasd_printf("Could not get drive rusage\n");
    return;
  }

  nasd_printf("Drive rusage stats:\n");

  /* CPU time */
  nasd_printf("  %d:%06d user seconds\n",
    usage.ru_utime.tv_sec, usage.ru_utime.tv_usec);
  nasd_printf("  %d:%06d system seconds\n",
    usage.ru_stime.tv_sec, usage.ru_stime.tv_usec);

  /* memory */
  nasd_printf("  %ld integral shared memory\n", usage.ru_ixrss);
  nasd_printf("  %ld integral unshared data\n", usage.ru_idrss);
  nasd_printf("  %ld integral unshared stack\n", usage.ru_isrss);
  nasd_printf("  %ld page reclaims (total vm faults)\n", usage.ru_minflt);
  nasd_printf("  %ld page faults\n", usage.ru_majflt);
  nasd_printf("  %ld swaps\n", usage.ru_nswap);

  /* I/O, IPC, signals, scheduling */
  nasd_printf("  %ld,%ld blocks in,out\n", usage.ru_inblock, usage.ru_oublock);
  nasd_printf("  %ld,%ld messages sent,received\n", usage.ru_msgsnd, usage.ru_msgrcv);
  nasd_printf("  %ld signals\n", usage.ru_nsignals);
  nasd_printf("  %ld voluntary context switches\n", usage.ru_nvcsw);
  nasd_printf("  %ld involuntary context switches\n", usage.ru_nivcsw);
}
#endif /* DEC_OSF */

/*
 * Initialize the user-level I/O module.
 *
 * devname  path of the backing device; copied into nasd_u_devname.
 *          Must fit in that buffer (MAXPATHLEN bytes including the
 *          terminator), otherwise NASD_FAIL is returned.
 * config   drive configuration; ios_outstanding gives the number of
 *          worker threads to create and must be at least 1
 *          (NASD_BAD_IOQUEUE_LEN otherwise).
 *
 * In order, this: resets the pending list, opens the extra
 * descriptor used for synchronous I/O, creates the module's condition
 * variables, mutexes and thread groups, starts the worker threads and
 * waits for them to come up, and creates the flush pipe. Each
 * resource is registered on the nasd_odc_shutdown list as soon as it
 * exists; when a registration fails the resource is released here
 * before returning.
 *
 * Returns NASD_SUCCESS, or the status of the step that failed.
 * Exits the process if the device cannot be opened.
 *
 * Once the worker threads are running they are stopped on every
 * later error return: their stop routine is only added to the
 * shutdown list at the very end, so nothing else would stop them.
 */
nasd_status_t
nasd_od_io_init(
  char              *devname,
  nasd_od_config_t  *config)
{
  nasd_thread_t handle;
  nasd_status_t rc;
  int i, ret;

  nasd_od_ioq_max_outstanding = config->ios_outstanding;
  if (nasd_od_ioq_max_outstanding < 1)
    return(NASD_BAD_IOQUEUE_LEN);

  NASD_IO_MODULE_INIT();

  /* empty circular list: the sentinel points at itself */
  bzero((char *)&nasd_u_pending, sizeof(nasd_u_pending));
  nasd_u_pending.cnext = nasd_u_pending.cprev = &nasd_u_pending;

#ifdef DEC_OSF
  if (nasd_od_rusage_at_shutdown) {
    rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_shutdown_rusage, NULL);
    if (rc) {
      /* don't need to call it here */
      return(rc);
    }
  }
#endif /* DEC_OSF */

  /* refuse names that would overrun the fixed-size buffer */
  if (strlen(devname) >= sizeof(nasd_u_devname)) {
    nasd_printf("DRIVE ERROR: device name too long (limit %d characters)\n",
      (int)(sizeof(nasd_u_devname) - 1));
    return(NASD_FAIL);
  }
  strcpy(nasd_u_devname, devname);

  nasd_u_extrafd = open(nasd_u_devname, O_RDWR);
  if (nasd_u_extrafd < 0) {
    perror(nasd_u_devname);
    exit(1);
  }
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_close_fd,
    (void *)((u_long)nasd_u_extrafd));
  if (rc) {
    close(nasd_u_extrafd);
    return(rc);
  }

  NASD_U_ICOND(&nasd_u_run_cond);
  NASD_U_ICOND(&nasd_u_q_cond);

  NASD_U_MUTEX(&nasd_u_q_lock);
  NASD_U_MUTEX(&nasd_u_extrafd_lock);

  rc = nasd_init_threadgroup(&nasd_u_io_threadgroup);
  if (rc)
    return(rc);
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_shutdown_io_threadgroup,
    NULL);
  if (rc) {
    nasd_u_shutdown_io_threadgroup(NULL);
    return(rc);
  }

  rc = nasd_init_threadgroup(&nasd_u_flushproc_threadgroup);
  if (rc)
    return(rc);
  rc = nasd_shutdown_proc(nasd_odc_shutdown,
    nasd_u_shutdown_flushproc_threadgroup, NULL);
  if (rc) {
    nasd_u_shutdown_flushproc_threadgroup(NULL);
    return(rc);
  }

  /* create I/O threads */
  for(i=0;i<nasd_od_ioq_max_outstanding;i++) {
    rc = nasd_thread_create(&handle, nasd_od_uio_proc,
      (nasd_threadarg_t)((u_long)i));
    if (rc) {
      /* let the threads created so far come up, then take them down */
      NASD_THREADGROUP_WAIT_START(&nasd_u_io_threadgroup);
      nasd_u_stop_iothreads(NULL);
      return(rc);
    }
    NASD_THREADGROUP_STARTED(&nasd_u_io_threadgroup);
  }

  /* wait for threads to start */
  NASD_THREADGROUP_WAIT_START(&nasd_u_io_threadgroup);

  ret = pipe(nasd_u_flushpipe);
  if (ret) {
    nasd_printf("ERROR %d from pipe()\n", errno);
    /* workers are running but not yet on the shutdown list */
    nasd_u_stop_iothreads(NULL);
    return(NASD_FAIL);
  }
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_kill_flushpipe, NULL);
  if (rc) {
    nasd_u_kill_flushpipe(NULL);
    /* workers are running but not yet on the shutdown list */
    nasd_u_stop_iothreads(NULL);
    return(rc);
  }

  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_stop_iothreads, NULL);
  if (rc) {
    nasd_u_stop_iothreads(NULL);
    return(rc);
  }

  return(NASD_SUCCESS);
}

/*
 * Start the background flush thread.
 *
 * Creates the thread, waits until it reports itself running, and
 * then registers nasd_u_stop_flush_proc on the nasd_odc_shutdown
 * list. If that registration fails the thread is stopped again
 * before returning.
 *
 * Returns NASD_SUCCESS, or the status of the step that failed.
 */
nasd_status_t
nasd_od_io_go()
{
  nasd_thread_t flusher;
  nasd_status_t status;

  status = nasd_thread_create(&flusher, nasd_od_uio_flush_proc, NULL);
  if (status)
    return(status);

  /* account for the new thread, then wait for it to come up */
  NASD_THREADGROUP_STARTED(&nasd_u_flushproc_threadgroup);
  NASD_THREADGROUP_WAIT_START(&nasd_u_flushproc_threadgroup);

  status = nasd_shutdown_proc(nasd_odc_shutdown, nasd_u_stop_flush_proc, NULL);
  if (!status)
    return(NASD_SUCCESS);

  /* could not arrange for a later stop, so stop it now */
  nasd_u_stop_flush_proc(NULL);
  return(status);
}

/*
 * Hand an I/O chain to the worker threads.
 *
 * If the number of outstanding I/Os is already at the configured
 * maximum, nothing is queued and NASD_IOSYS_FULL is returned.
 * Otherwise the outstanding count is bumped, entlist is appended to
 * the tail of nasd_u_pending, one waiter on nasd_u_q_cond is
 * signalled, and NASD_SUCCESS is returned.
 *
 * Caller holds I/O lock here
 */
nasd_status_t
nasd_od_io_launch(
  nasd_odc_ent_t  *entlist)
{
  nasd_odc_ent_t *tail;

  NASD_IO_PEND_LOCK();

  if (nasd_od_io_ios_outstanding < nasd_od_ioq_max_outstanding) {
    nasd_od_io_ios_outstanding++;

    /* splice in between the current tail and the sentinel */
    tail = nasd_u_pending.cprev;
    entlist->cprev = tail;
    entlist->cnext = &nasd_u_pending;
    tail->cnext = entlist;
    nasd_u_pending.cprev = entlist;

    NASD_IO_PEND_UNLOCK();

    NASD_SIGNAL_COND(nasd_u_q_cond);

    return(NASD_SUCCESS);
  }

  NASD_IO_PEND_UNLOCK();
  return(NASD_IOSYS_FULL);
}

/*
 * Synchronously write one cache block to disk.
 *
 * Seeks the shared extra descriptor to the block's sector
 * (ent->real_sectno) and writes NASD_OD_BASIC_BLOCKSIZE bytes from
 * ent->data.buf, all under the extra-fd lock so the seek and the
 * write cannot be separated by another user of that descriptor.
 *
 * A write that does not transfer the whole block is fatal. This is
 * checked explicitly, the same way the header read/write routines in
 * this file check their transfers, so the check does not depend on
 * how NASD_ASSERT is configured.
 */
void
nasd_od_io_sys_flush_block(
  nasd_odc_ent_t  *ent)
{
  nasd_uint64 got, want;
  int rc;

  NASD_IO_EFD_LOCK();

  want = ((nasd_uint64)ent->real_sectno) << NASD_OD_SECT_SHIFT;
  got = nasd_lseek(nasd_u_extrafd, want, SEEK_SET, &rc);
  NASD_ASSERT(got == want);

  NASD_IO_INC_SIZE_STAT(1,write);
  nasd_od_io_sync_launch(ent->real_sectno);
  NASD_IO_TM_LAUNCH(ent);
  rc = write(nasd_u_extrafd, ent->data.buf, NASD_OD_BASIC_BLOCKSIZE);
  if (rc != NASD_OD_BASIC_BLOCKSIZE) {
    nasd_printf("DRIVE UIO: flush got rc=%d wanted %d errno %d\n",
      rc, (int)NASD_OD_BASIC_BLOCKSIZE, errno);
    NASD_PANIC();
  }
  NASD_IO_TM_COMPLETE(ent);

  NASD_IO_EFD_UNLOCK();

  /* remember where the disk head was left */
  if (rc > 0) {
    NASD_IO_LASTCOMP_ASSIGN(got + (nasd_uint64)rc);
  }
}

/*
 * "Asynchronous" flush of one cache block.
 *
 * Not actually asynchronous: the block is written with the
 * synchronous flush routine and the async completion hook
 * (nasd_od_io_flush_block_async_finish) is then invoked directly,
 * before this function returns.
 */
void
nasd_od_io_sys_flush_block_async(
  nasd_odc_ent_t  *ent)
{
  /*
   * Eventually, something clever could be done here.
   */
  nasd_od_io_sys_flush_block(ent);
  nasd_od_io_flush_block_async_finish(ent);
}

/*
 * Call with diskstate locked
 */
nasd_status_t
nasd_od_write_diskstate(
  int  force_sync)
{
  nasd_sectno_t last_comp, diff1, diff2;
  nasd_odc_ent_t fake_ent;
  int rc, write_which;
  nasd_uint64 got, want;

  NASD_IO_TM_ENQ(&fake_ent);

  nasd_odc_state->disk->mod_time = nasd_odc_state->nvstate->mod_time;

  if (force_sync) {
    NASD_IO_INC_IO_STAT(header_force_sync,write);
  }

  write_which = 0; /* shut up whiner compiler */
  if (force_sync == 0) {
    last_comp = nasd_od_io_last_completed_sect;
    if (last_comp >= nasd_diskheader_dup_blk) {
      write_which = 2;
    }
    else {
      NASD_ASSERT(nasd_diskheader_blk <= last_comp);
      NASD_ASSERT(nasd_diskheader_dup_blk > last_comp);
      diff1 = last_comp - nasd_diskheader_blk;
      diff2 = nasd_diskheader_dup_blk - last_comp;
      if (diff1 > diff2)
        write_which = 2;
      else
        write_which = 1;
    }
  }

  NASD_IO_EFD_LOCK();

  if (force_sync || (write_which == 1)) {
    want = ((nasd_uint64)nasd_diskheader_blk) << NASD_OD_SECT_SHIFT;
    got = nasd_lseek(nasd_u_extrafd, want, SEEK_SET, &rc);
    NASD_ASSERT(got == want);
    NASD_IO_TM_LAUNCH(&fake_ent);
    nasd_od_io_sync_launch(nasd_diskheader_blk);
    rc = write(nasd_u_extrafd, (void *)nasd_odc_state->disk,
      NASD_OD_SECT_SIZE);
    if (rc != NASD_OD_SECT_SIZE)
      NASD_PANIC();
    NASD_IO_TM_COMPLETE(&fake_ent);

    if ((force_sync == 0) && (rc > 0)) {
      NASD_IO_LASTCOMP_ASSIGN(got + (nasd_uint64)rc);
    }

    NASD_IO_INC_IO_STAT(header_write,write);
    NASD_IO_INC_IO_STAT(header1_write,write);
  }

  if (force_sync || (write_which == 2)) {
    want = ((nasd_uint64)nasd_diskheader_dup_blk) << NASD_OD_SECT_SHIFT;
    got = nasd_lseek(nasd_u_extrafd, want, SEEK_SET, &rc);
    NASD_ASSERT(got == want);
    NASD_IO_TM_LAUNCH(&fake_ent);
    nasd_od_io_sync_launch(nasd_diskheader_dup_blk);
    rc = write(nasd_u_extrafd, (void *)nasd_odc_state->disk,
      NASD_OD_SECT_SIZE);
    if (rc != NASD_OD_SECT_SIZE)
      NASD_PANIC();
    NASD_IO_TM_COMPLETE(&fake_ent);

    if (rc > 0) {
      NASD_IO_LASTCOMP_ASSIGN(got + (nasd_uint64)rc);
    }

    NASD_IO_INC_IO_STAT(header_write,write);
    NASD_IO_INC_IO_STAT(header2_write,write);
  }

  NASD_IO_EFD_UNLOCK();

  NASD_IO_TM_DONE(&fake_ent);

  return(NASD_SUCCESS);
}

/*
 * Synchronously read one disk header.
 *
 * sectno  sector holding the header copy to read
 * disk    destination; receives NASD_OD_SECT_SIZE bytes
 *
 * The seek and the read are done on the shared extra descriptor
 * under its lock. A read that does not return a full sector is
 * fatal.
 */
void
nasd_od_io_read_header(
  nasd_blkno_t     sectno,
  nasd_od_disk_t  *disk)
{
  nasd_odc_ent_t fake_ent;
  nasd_uint64 got, want;
  int nread, seek_rc;

  NASD_IO_TM_ENQ(&fake_ent);

  NASD_IO_EFD_LOCK();

  want = ((nasd_uint64)sectno) << NASD_OD_SECT_SHIFT;
  got = nasd_lseek(nasd_u_extrafd, want, SEEK_SET, &seek_rc);
  NASD_ASSERT(got == want);

  nasd_od_io_sync_launch(sectno);
  NASD_IO_TM_LAUNCH(&fake_ent);
  nread = read(nasd_u_extrafd, (void *)disk,
    NASD_OD_SECT_SIZE);
  if (nread != NASD_OD_SECT_SIZE)
    NASD_PANIC();
  NASD_IO_TM_COMPLETE(&fake_ent);

  /* remember where the disk head was left */
  if (nread > 0) {
    NASD_IO_LASTCOMP_ASSIGN(got + (nasd_uint64)nread);
  }
  NASD_IO_INC_IO_STAT(header_read,read);

  NASD_IO_EFD_UNLOCK();

  NASD_IO_TM_DONE(&fake_ent);
}

/*
 * Give a cache handle its backing page.
 *
 * Allocates NASD_OD_BASIC_BLOCKSIZE bytes with NASD_Valloc, zeroes
 * them, and stores the result in ent->data.buf. Returns NASD_NO_MEM
 * (leaving ent untouched) if the allocation fails, NASD_SUCCESS
 * otherwise.
 */
nasd_status_t
nasd_odc_io_alloc_page(
  nasd_odc_ent_t  *ent)
{
  void *page;

  NASD_Valloc(page, NASD_OD_BASIC_BLOCKSIZE, (void *));
  if (page != NULL) {
    bzero(page, NASD_OD_BASIC_BLOCKSIZE);
    ent->data.buf = page;
    return(NASD_SUCCESS);
  }

  return(NASD_NO_MEM);
}

/*
 * Free the backing page of a cache handle, if it has one, and clear
 * ent->data.buf so a repeated call does nothing.
 */
void
nasd_odc_io_release_page(
  nasd_odc_ent_t  *ent)
{
  if (ent->data.buf != NULL) {
    NASD_Free(ent->data.buf, NASD_OD_BASIC_BLOCKSIZE);
    ent->data.buf = NULL;
  }
}

/*
 * Ask the drive process to shut down by sending it SIGINT.
 *
 * The signal goes to nasd_drive_signal_pid rather than to getpid().
 * On Linux every thread has its own pid: getpid() reports the
 * calling thread's pid and getppid() does not reliably identify the
 * spawning thread, so the user-level drive records the right pid at
 * startup (on Linux, the pid of its dedicated shutdown thread) and
 * that saved value is used here.
 *
 * The flags argument is not examined. Returns NASD_FAIL if kill()
 * fails, NASD_SUCCESS otherwise.
 */
nasd_status_t
nasd_od_sys_rshutdown(
  nasd_drive_rshutdown_flags_t  flags)
{
  extern pid_t nasd_drive_signal_pid;

  if (kill(nasd_drive_signal_pid, SIGINT) != 0)
    return(NASD_FAIL);

  return(NASD_SUCCESS);
}

/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
