/*
 * nasd_diskman.c
 *
 * Basic disk management for on-disk fs
 *
 * Author: Jim Zelenka
 */
/*
 * Copyright (c) of Carnegie Mellon University, 1997,1998,1999.
 *
 * Permission to reproduce, use, and prepare derivative works of
 * this software for internal use is granted provided the copyright
 * and "No Warranty" statements are included with all reproductions
 * and derivative works. This software may also be redistributed
 * without charge provided that the copyright and "No Warranty"
 * statements are included in all redistributions.
 *
 * NO WARRANTY. THIS SOFTWARE IS FURNISHED ON AN "AS IS" BASIS.
 * CARNEGIE MELLON UNIVERSITY MAKES NO WARRANTIES OF ANY KIND, EITHER
 * EXPRESSED OR IMPLIED AS TO THE MATTER INCLUDING, BUT NOT LIMITED
 * TO: WARRANTY OF FITNESS FOR PURPOSE OR MERCHANTABILITY, EXCLUSIVITY
 * OF RESULTS OR RESULTS OBTAINED FROM USE OF THIS SOFTWARE. CARNEGIE
 * MELLON UNIVERSITY DOES NOT MAKE ANY WARRANTY OF ANY KIND WITH RESPECT
 * TO FREEDOM FROM PATENT, TRADEMARK, OR COPYRIGHT INFRINGEMENT.
 */


#include <nasd/nasd_options.h>
#include <nasd/nasd_drive_options.h>
#include <nasd/nasd_types.h>
#include <nasd/nasd_freelist.h>
#include <nasd/nasd_itypes.h>
#include <nasd/nasd_mem.h>
#include <nasd/nasd_cache.h>
#include <nasd/nasd_common.h>
#include <nasd/nasd_sys.h>
#include <nasd/nasd_layout.h>
#include <nasd/nasd_keymgmt_dr.h>
#include <nasd/nasd_shutdown.h>
#include <nasd/nasd_nonce_mgmt.h>
#include <nasd/nasd_trace.h>
#include <nasd/nasd_trace_dr.h>
#include <nasd/nasd_timeout.h>
#include <nasd/nasd_security_dr.h>
#include <nasd/nasd_drive_utils.h>
#ifndef KERNEL
#include <nasd/nasd_mq.h>
#endif /* !KERNEL */
#include <nasd/nasd_pipe.h>
#include <nasd/nasd_udppipe.h>
#include <nasd/nasd_remote.h>

#if MJU_DEBUG
/* Declared only in MJU_DEBUG builds; defined outside this file. */
extern pthread_mutex_t nasd_u_q_lock;
#endif /* MJU_DEBUG */

/*
 * Convenience flag: do we create a partition on format?
 *
 * Within this file: nasd_format_disk() changes any nonzero value to 2
 * once a format has completed; nasd_setup_disk() creates one partition
 * when it sees the value 2 and always resets the flag to 0 afterwards.
 */
extern int nasd_create_partition_on_format;

/*
 * Drive-global shutdown list. Created by nasd_basic_init() and
 * executed by nasd_basic_shutdown().
 */
nasd_shutdown_list_t *nasd_odc_shutdown = NULL;

/*
 * Do we force alignment of the first datablock?
 * If so, to what boundary? (0 = no force)
 * This is a sector count, so 64=32k, 128=64k, etc
 *
 * NOTE(review): the code in this file right-shifts this value by
 * NASD_OD_SECT_SHIFT before using it as a sector count, which reads as
 * if the value were in bytes - confirm the intended units.
 */
extern int nasd_od_force_align_disk_boundary;

/*
 * How many sectors into the disk the first "data" block starts at.
 * Set by nasd_setup_disk() from the computed basic layout.
 */
nasd_sectcnt_t nasd_firstblock_offset;
/*
 * How many sectors into the disk the first refcnt starts at
 * (one for diskheader).
 * Set by nasd_setup_disk() from the computed basic layout.
 */
nasd_sectcnt_t nasd_firstref_offset;
/*
 * Which sectors hold the primary and duplicate disk headers.
 * Set by nasd_setup_disk() from the computed basic layout.
 */
nasd_sectno_t nasd_diskheader_blk;
nasd_sectno_t nasd_diskheader_dup_blk;

/*
 * Disk layout:
 * header block
 * refcnt blocks
 * partitions ("data" blocks)
 * header block 2
 *
 * "data" blocks (including metadata) begin with block "0" in the
 * partitions section... so the I/O subsystem adds nasd_firstblock_offset
 * to the block number for all data blocks, and nasd_firstref_offset
 * for all refcnt blocks
 */

/*
 * Backing storage for the drive state; nasd_setup_disk() points
 * nasd_odc_state at this object.
 */
static nasd_odc_state_t nasd_odc_state_v;

/*
 * XXX should eventually be nvram
 *
 * Until then nasd_setup_disk() zeroes this on every start, so the
 * "nvram" contents never survive a restart.
 */
static nasd_odc_nvstate_t nasd_odc_nvstate_v;

/*
 * What percentage of the disk might be inodes
 * (consumed by nasd_drive_nnpt_expected() to size the node pagetable)
 */
extern int nasd_od_nnpt_pcg;

/*
 * For conserving stack space.
 * Freelist of nasd_drive_opholder_t; created in nasd_setup_disk() and
 * destroyed by nasd_od_shutdown_drive_opholder_freelist().
 */
nasd_freelist_t *nasd_drive_opholder_freelist;

/* Drive nonce manager; initialized in nasd_setup_disk(). */
extern nasd_sec_nonce_mgr_t *nasd_drive_nonce_mgr;

/*
 * Bring up the subsystems everything else depends on, in this order:
 * threads, memory, shutdown system, the drive-global shutdown list
 * (nasd_odc_shutdown), timeouts, pipes.
 *
 * Returns NASD_SUCCESS, or the status of the first initializer that
 * failed. On failure every subsystem that had already been started is
 * shut down again in reverse order, so the caller has nothing to undo.
 */
nasd_status_t
nasd_basic_init()
{
  nasd_status_t rc;

  /*
   * We do this really early so that things such as security
   * that want to use it will have it all ready.
   */
  bzero((char *)nasd_odc_zeroblk, NASD_OD_BASIC_BLOCKSIZE);

  rc = nasd_threads_init();
  if (rc)
    return(rc);

  rc = nasd_mem_init();
  if (rc)
    goto fail_threads;

  rc = nasd_shutdown_sys_init();
  if (rc)
    goto fail_mem;

  rc = nasd_shutdown_list_init(&nasd_odc_shutdown);
  if (rc)
    goto fail_shutdown_sys;

  rc = nasd_timeout_init();
  if (rc)
    goto fail_shutdown_list;

  rc = nasd_pipe_init(nasd_odc_shutdown);
  if (rc)
    goto fail_timeout;

  return(NASD_SUCCESS);

  /*
   * Unwind chain: each label undoes one subsystem and falls through
   * to the labels for everything that was started before it.
   */
fail_timeout:
  nasd_timeout_shutdown();
fail_shutdown_list:
  nasd_shutdown_list_shutdown(nasd_odc_shutdown,
    NASD_SHUTDOWN_ANNOUNCE_NONE);
fail_shutdown_sys:
  nasd_shutdown_cleanup();
fail_mem:
  nasd_mem_shutdown();
fail_threads:
  /*
   * Also reached when nasd_mem_init() itself fails, so the threads
   * subsystem is not left initialized on that path.
   */
  nasd_threads_shutdown();
  return(rc);
}

/*
 * Tear the drive down: stop incoming RPCs, stop the message queue
 * handlers (non-kernel builds only), suspend timeouts, run the
 * drive-global shutdown list, then shut down the timeout, shutdown,
 * memory and threads subsystems.
 *
 * Returns the status of running the shutdown list
 * (nasd_shutdown_list_shutdown); the remaining steps do not affect the
 * return value.
 */
nasd_status_t
nasd_basic_shutdown()
{
  nasd_status_t rc;

  /*
   * Progress messages are emitted only when NASD_DRIVE_SHUTDOWN_DEBUG
   * is positive; otherwise shutdown_printf() expands to nothing.
   * Note the double parentheses at the call sites: the whole argument
   * list is passed as one macro argument.
   */
#if NASD_DRIVE_SHUTDOWN_DEBUG > 0
#define shutdown_printf(_p_) nasd_printf _p_
#else /* NASD_DRIVE_SHUTDOWN_DEBUG > 0 */
#define shutdown_printf(_p_)
#endif /* NASD_DRIVE_SHUTDOWN_DEBUG > 0 */

  shutdown_printf(("DRIVE: shutting down incoming RPCs\n"));
  nasd_drive_shutdown_rpc();
  shutdown_printf(("DRIVE: no more requests outstanding\n"));

#ifndef KERNEL
  shutdown_printf(("DRIVE: shutting down message queue handlers\n"));
  nasd_shutdown_msgq();
  shutdown_printf(("DRIVE: message queue handlers done\n"));
#endif /* !KERNEL */

  shutdown_printf(("DRIVE: suspending timeout processing\n"));
  nasd_timeout_suspend();
  shutdown_printf(("DRIVE: no more timeouts will be dispatched\n"));

  nasd_debug_breakpoint();
  shutdown_printf(("DRIVE: begin automated shutdown\n"));
  /* Run everything that was registered on the drive-global list. */
  rc = nasd_shutdown_list_shutdown(nasd_odc_shutdown,
    NASD_DRIVE_AUTO_SHUTDOWN_DEBUG);
  shutdown_printf(("DRIVE: automated shutdown complete, status 0x%x (%s)\n",
    rc, nasd_error_string(rc)));

  /*
   * The remaining steps mirror nasd_basic_init() in reverse:
   * timeouts, shutdown system, memory, threads.
   */
  shutdown_printf(("DRIVE: shutdown timeout subsystem\n"));
  nasd_timeout_shutdown();
  shutdown_printf(("DRIVE: timeout subsystem shutdown complete\n"));

  shutdown_printf(("DRIVE: cleanup automated shutdown subsystem\n"));
  nasd_shutdown_cleanup();
  shutdown_printf(("DRIVE: automated shutdown subsystem cleanup complete\n"));

  shutdown_printf(("DRIVE: shutdown memory subsystem\n"));
  nasd_mem_shutdown();
  shutdown_printf(("DRIVE: memory subsystem shutdown complete\n"));

  shutdown_printf(("DRIVE: shutdown threads subsystem\n"));
  nasd_threads_shutdown();
  shutdown_printf(("DRIVE: threads subsystem shutdown complete\n"));

  return(rc);
}

/*
 * Retrieve newest disk header.
 *
 * Reads the primary and duplicate headers, decides which one is newer
 * (by header mod_time, then by per-partition mod_time), copies the
 * chosen header into nasd_odc_state->disk and derives
 * config->layout_type and nasd_odc_state->npt_sz from that same
 * header, so all three always describe one consistent header.
 *
 * nasd_odc_need_recover is set when the nvram keys are wrong, when the
 * two headers disagree, or when nvram disagrees with the header.
 *
 * Always returns NASD_SUCCESS.
 */
nasd_status_t
nasd_load_diskstate(
  nasd_od_config_t  *config)
{
  nasd_od_disk_t disk1, disk2, *newer;
  char str1[64], str2[64];
  int bad_nvram, i;

  nasd_od_io_read_header(nasd_diskheader_blk, &disk1);
  nasd_od_io_read_header(nasd_diskheader_dup_blk, &disk2);

  if ((nasd_odc_state->nvstate->key1 != NASD_C_KEY1)
    || (nasd_odc_state->nvstate->key2 != NASD_C_KEY2))
  {
    /*
     * "Keys" don't match, nvram is full of garbage.
     */
    bad_nvram = 1;
    nasd_odc_need_recover = 1;
  }
  else {
    bad_nvram = 0;
  }

  /*
   * Headers have timestamps- sometimes, we only write one header
   * (maybe the other write failed, or was never scheduled), so we
   * trust the "newest" one. In any case, if the header modified
   * times disagree, we need a sanity check.
   */
  if (!NASD_TIMESPEC_EQ(disk1.mod_time, disk2.mod_time)) {
    nasd_printf("Primary and duplicate disk headers do not match.\n");
    nasd_printf("%d:%09d != %d:%09d\n", disk1.mod_time.ts_sec,
      disk1.mod_time.ts_nsec, disk2.mod_time.ts_sec, disk2.mod_time.ts_nsec);
    nasd_odc_need_recover = 1;
  }
  else {
    if (!bad_nvram) {
      if (!NASD_TIMESPEC_EQ(nasd_odc_state->nvstate->mod_time, disk1.mod_time))
      {
        nasd_printf("NVRAM sequence disagrees with on-disk header.\n");
        nasd_odc_need_recover = 1;
      }
    }
  }

  /*
   * Pick the newer header. newer stays NULL when neither the header
   * timestamps nor any partition timestamp differ.
   */
  newer = NULL;
  if (NASD_TIMESPEC_GT(disk1.mod_time, disk2.mod_time)) {
    newer = &disk1;
  }
  else if (NASD_TIMESPEC_GT(disk2.mod_time, disk1.mod_time)) {
    newer = &disk2;
  }
  else {
    /*
     * Disk header itself has same timestamp, check partitions
     */
    for(i=0;i<NASD_OD_MAXPARTS;i++) {
      if (NASD_TIMESPEC_GT(disk1.partitions[i].mod_time,
        disk2.partitions[i].mod_time))
      {
        newer = &disk1;
        break;
      }
      else if (NASD_TIMESPEC_GT(disk2.partitions[i].mod_time,
        disk1.partitions[i].mod_time))
      {
        newer = &disk2;
        break;
      }
    }
  }

  if (newer == NULL) {
    nasd_printf("DRIVE: headers synchronized\n");
    newer = &disk1;
  }
  else {
    nasd_odc_need_recover = 1;
    if (newer == &disk1) {
      nasd_printf("DRIVE: header1 newer than header2\n");
    }
    else {
      NASD_ASSERT(newer == &disk2);
      nasd_printf("DRIVE: header2 newer than header1\n");
    }
  }

  /*
   * Report format/modification times; the second form is only shown
   * when nasd_timestr_r() produces something different from the raw
   * sec:nsec rendering.
   */
  sprintf(str1, "%d:%09d", newer->format_time.ts_sec, newer->format_time.ts_nsec);
  nasd_timestr_r(newer->format_time, str2);
  if (strcmp(str1, str2)) {
    nasd_printf("DRIVE: header formatted at %s (%s)\n", str1, str2);
  }
  else {
    nasd_printf("DRIVE: header formatted at %s\n", str1);
  }

  sprintf(str1, "%d:%09d", newer->mod_time.ts_sec, newer->mod_time.ts_nsec);
  nasd_timestr_r(newer->mod_time, str2);
  if (strcmp(str1, str2)) {
    nasd_printf("DRIVE: header modified at %s (%s)\n", str1, str2);
  }
  else {
    nasd_printf("DRIVE: header modified at %s\n", str1);
  }

  /*
   * Install the chosen header and derive everything else from it.
   * (Taking these values from disk1 unconditionally would use a stale
   * or never-written primary header whenever the duplicate is newer.)
   */
  bcopy((char *)newer, (char *)nasd_odc_state->disk, sizeof(nasd_od_disk_t));
  config->layout_type = newer->layout_type;
  nasd_odc_state->npt_sz = newer->npt_ext.last - newer->npt_ext.first + 1;

  return(NASD_SUCCESS);
}

/*
 * Perform basic setup
 *
 * Computes the sector layout for a device of num_real_sectors sectors,
 * allocates and initializes the drive state, loads the on-disk header
 * (unless nasd_odc_force_format is set), brings up the drive
 * subsystems in order, formats the disk when required, and optionally
 * creates one maximally sized partition right after a format.
 *
 *   num_real_sectors  size of the device in sectors
 *   dev               device handle, stored in nasd_odc_state->dev
 *   config            drive configuration, passed to the I/O queue,
 *                     layout and format code
 *
 * Returns NASD_SUCCESS, or the status of the first step that failed.
 * Failures of the startup sanity check, the refcount preread, the NPT
 * preread and the on-format partition creation are reported but do
 * not abort setup.
 */
nasd_status_t
nasd_setup_disk(
  unsigned long      num_real_sectors,
  dev_t              dev,
  nasd_od_config_t  *config)
{
  nasd_drive_layout_basic_t basic_layout;
  nasd_odc_icpart_t *icp;
  nasd_status_t rc;
  int i, formatted;

  nasd_printf("DRIVE: Initializing disk with %lu sectors\n", num_real_sectors);

  rc = nasd_drive_compute_basic_layout(num_real_sectors,
    &basic_layout);
  if (rc) {
    nasd_printf("DRIVE: failed computing layout rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  /* Publish the computed layout through the drive-wide globals. */
  nasd_diskheader_blk = basic_layout.header1_blk;
  nasd_diskheader_dup_blk = basic_layout.header2_blk;
  nasd_od_blocks = basic_layout.num_blocks;
  nasd_odc_refblocks = basic_layout.num_refblocks;
  nasd_firstblock_offset = basic_layout.firstblock_offset;
  nasd_firstref_offset = basic_layout.firstref_offset;

#if 1
  nasd_printf("\nSector layout for %lu real sectors:\n", num_real_sectors);
  nasd_printf("  System overhead: 0..%lu\n", (u_long)(nasd_diskheader_blk - 1));
  nasd_printf("  Header: %lu..%lu\n", (u_long)nasd_diskheader_blk,
    (u_long)nasd_diskheader_blk);
  nasd_printf("  Refcounts: %lu..%lu\n", (u_long)nasd_firstref_offset,
    (u_long)(nasd_firstref_offset
    + (nasd_odc_refblocks*NASD_OD_SECTORS_PER_BLK) - 1));
  nasd_printf("  Data: %lu..%lu\n",
    (u_long)nasd_firstblock_offset,
    (u_long)basic_layout.last_data_sector);
  nasd_printf("  Duplicate header: %lu..%lu\n",
    (u_long)nasd_diskheader_dup_blk,
    (u_long)nasd_diskheader_dup_blk);
  nasd_printf("  Block 1..2: %u..%u\n",
    nasd_odc_real_sectno(1, NASD_ODC_T_ANON),
    nasd_odc_real_sectno(2, NASD_ODC_T_ANON));
  nasd_printf("  Block 1..%u: %u..%u\n", nasd_od_blocks,
    nasd_odc_real_sectno(1, NASD_ODC_T_ANON),
    nasd_odc_real_sectno(nasd_od_blocks, NASD_ODC_T_ANON));
#endif

  nasd_odc_state = &nasd_odc_state_v;

  NASD_Valloc(nasd_odc_state->disk,sizeof(nasd_od_disk_t),(nasd_od_disk_t *));
  NASD_ASSERT(nasd_odc_state->disk != NULL);

  /* Make sure it comes up as uninitialized unless we load data
     to the contrary */
  nasd_odc_state->disk->initialized = 0;
  rc = nasd_mutex_init(&nasd_odc_state->lock);
  if (rc) {
    nasd_printf("DRIVE ERROR: failed nasd_mutex_init, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }
  rc = nasd_shutdown_mutex(nasd_odc_shutdown, &nasd_odc_state->lock);
  if (rc) {
    return(rc);
  }

  /* One rwlock per in-core partition, each registered for shutdown. */
  for(i=0;i<NASD_OD_MAXPARTS;i++) {
    icp = &nasd_odc_state->parts[i];
    rc = nasd_rwlock_init(&icp->lock);
    if (rc) {
      nasd_printf("DRIVE ERROR: failed nasd_rwlock_init, i=%d, rc=0x%x (%s)\n",
        i, rc, nasd_error_string(rc));
      return(rc);
    }
    rc = nasd_shutdown_rwlock(nasd_odc_shutdown, &nasd_odc_state->parts[i].lock);
    if (rc) {
      return(rc);
    }
    icp->last_objlist_npt = 0;
    icp->last_objlist_off = 0;
  }

  /*
   * ??? What is this for? It will get blown
   * away below, it seems.
   */
  nasd_odc_state->disk->partitions[0].min_protection = NASD_NO_PROTECTION;

  nasd_odc_state->dev = dev;
  nasd_odc_state->cr_ind = 0;
  /* XXX eventually, hook up to nvram */
  nasd_odc_state->nvstate = &nasd_odc_nvstate_v;
  /* always invalidate while we don't really have nvram */
  bzero((char *)&nasd_odc_nvstate_v, sizeof(nasd_odc_nvstate_v));

  NASD_FREELIST_CREATE(nasd_drive_opholder_freelist,
    1024, /* max free */
      24, /* increment */
    sizeof(nasd_drive_opholder_t));
  if (nasd_drive_opholder_freelist == NULL)
    return(NASD_NO_MEM);
  rc = nasd_shutdown_proc(nasd_odc_shutdown,
    nasd_od_shutdown_drive_opholder_freelist, NULL);
  if (rc) {
    /* Could not register the destructor, so destroy it right away. */
    nasd_od_shutdown_drive_opholder_freelist(NULL);
    return(rc);
  }

  NASD_FREELIST_PRIME(nasd_drive_opholder_freelist, 128,next,
    (nasd_drive_opholder_t *));

  rc = nasd_od_ioqueue_init(config);
  if (rc) {
    nasd_printf("DRIVE ERROR: "
      "failed initializing I/O queue subsystem, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  /* Only read the existing headers when a format is not being forced. */
  if (nasd_odc_force_format == 0) {
    rc = nasd_load_diskstate(config);
    if (rc) {
      nasd_printf("DRIVE ERROR: failed nasd_load_diskstate, rc=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      return(rc);
    }
  }


  rc = nasd_odc_freeblock_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: failed nasd_odc_freeblock_init, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_od_obj_sysinit();
  if (rc) {
    nasd_printf("DRIVE ERROR: failed nasd_od_obj_sysinit, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_obj_control_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: failed nasd_obj_control_init, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_cache_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: failed nasd_cache_init, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_od_layout_init(config);
  if (rc) {
    nasd_printf("DRIVE ERROR: could not init static layout subsys, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  if (nasd_odc_need_format || nasd_odc_force_format) {
    rc = nasd_format_disk(config);
    if (rc) {
      nasd_printf("DRIVE ERROR: failed nasd_format_disk, rc=0x%x (%s)\n",
        rc, nasd_error_string(rc));
      return(rc);
    }
    formatted = 1;

    /* A failed sanity check is only a warning; setup continues. */
    rc = nasd_od_check_startup();
    if (rc) {
      nasd_printf("DRIVE WARNING: failed startup sanity checks, rc=0x%x (%s)\n",
        rc, nasd_error_string(rc));
    }
  }
  else {
    /*
     * init w/out format
     * XXX check dirty counts, try to recover?
     * read in refcnts
     */
    rc = nasd_od_check_startup();
    if (rc) {
      nasd_printf("DRIVE WARNING: failed startup sanity checks, rc=0x%x (%s)\n",
        rc, nasd_error_string(rc));
    }

    formatted = 0;
  }

#if NASD_DRIVE_PREREAD_REFCNT > 0
  nasd_printf("DRIVE: prereading %u refcount blocks\n", nasd_odc_refblocks);
  rc = nasd_odc_load_refs();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot load old refcnts, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
  }
  nasd_printf("DRIVE: done prereading refcnt blocks\n");
#endif /* NASD_DRIVE_PREREAD_REFCNT > 0 */

  nasd_printf("DRIVE: npt1 %u..%u (%u..%u)\n",
    nasd_odc_state->disk->npt_ext.first,
    nasd_odc_state->disk->npt_ext.last,
    nasd_odc_real_sectno(nasd_odc_state->disk->npt_ext.first, NASD_ODC_T_NPT1),
    nasd_odc_real_sectno(nasd_odc_state->disk->npt_ext.last, NASD_ODC_T_NPT1));
  nasd_printf("DRIVE: npt2 %u..%u (%u..%u)\n",
    nasd_odc_state->disk->npt2_ext.first,
    nasd_odc_state->disk->npt2_ext.last,
    nasd_odc_real_sectno(nasd_odc_state->disk->npt2_ext.first, NASD_ODC_T_NPT2),
    nasd_odc_real_sectno(nasd_odc_state->disk->npt2_ext.last, NASD_ODC_T_NPT2));

  if (nasd_odc_need_recover) {
    nasd_printf("DRIVE: NVRAM contents lost, rechecking all info\n");
  }
  /*
   * Without a fresh format, any dirty metadata count recorded in
   * nvram also forces recovery. The actual rescans are not
   * implemented yet (compiled out below).
   */
  if ((formatted == 0) && (nasd_odc_need_recover
    || nasd_odc_state->nvstate->dirty_counts[NASD_ODC_T_NODE]
    || nasd_odc_state->nvstate->dirty_counts[NASD_ODC_T_NPT1]
    || nasd_odc_state->nvstate->dirty_counts[NASD_ODC_T_NPT2]
    || nasd_odc_state->nvstate->dirty_counts[NASD_ODC_T_IND]))
  {
    nasd_odc_need_recover = 1;
#if 0
    nasd_printf("DRIVE: Rescanning blocks for sanity\n");
    /* XXX check nodes, node lists, etc, for sanity */
#endif
  }
  if ((formatted == 0) && (nasd_odc_need_recover
    || nasd_odc_state->nvstate->dirty_counts[NASD_ODC_T_REFCNT]))
  {
#if 0
    nasd_printf("DRIVE: Recomputing block reference counts\n");
    /* XXX recompute refcnts */
#endif
  }

  rc = nasd_odc_freeblock_build_lists();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot build free block lists, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  /* The dynamic layout starts at the first block after NPT2. */
  rc = nasd_od_layout_init_dynamic(nasd_odc_state->disk->npt2_ext.last + 1);
  if (rc) {
    nasd_printf("DRIVE ERROR: could not init dynamic layout subsys, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_drive_rpc_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init rpc subsystem, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_od_udp_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init udp pipe subsystem, rc=0x%x (%s)\n",
           rc, nasd_error_string(rc));
    return rc;
  }

#ifndef KERNEL
  rc = nasd_od_mq_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init message queue subsystem, rc=0x%x (%s)\n",
           rc, nasd_error_string(rc));
    return(rc);
  }
#endif /* !KERNEL */



  rc = nasd_remote_sysinit();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init active disk subsystem, rc=0x%x (%s)\n",
                rc, nasd_error_string(rc));
    return (rc);
  }

  rc = nasd_sec_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init security subsystem, rc=0x%x (%s)\n",
                rc, nasd_error_string(rc));
    return rc;
  }

  rc = nasd_sec_init_keymgmt();
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init key management subsystem, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_sec_init_nonce();
  if (rc) {
    nasd_printf(
      "DRIVE ERROR: cannot init nonce management subsystem, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }
  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_sec_shutdown_nonce,
                          NULL);
  if(rc) {
    nasd_printf("DRIVE ERROR: cannot add shutdown handler for nonce subsystem, rc=0x%x (%s)\n",
                rc, nasd_error_string(rc));
    nasd_sec_shutdown_nonce(NULL);
    return rc;
  }

  rc = nasd_sec_nonce_mgr_init(&nasd_drive_nonce_mgr,
    NASD_DRIVE_NONCE_TABLE_SIZE);
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot init nonce manager, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  rc = nasd_shutdown_proc(nasd_odc_shutdown, nasd_sec_nonce_mgr_shutdown,
    nasd_drive_nonce_mgr);
  if (rc) {
    nasd_printf("DRIVE ERROR: cannot add shutdown handler for nonce subsystem, rc=0x%x (%s)\n",
                rc, nasd_error_string(rc));
    nasd_sec_nonce_mgr_destroy(nasd_drive_nonce_mgr);
    return rc;
  }

  rc = nasd_dt_trace_init();
  if (rc) {
    nasd_printf("DRIVE ERROR: could not init trace subsys, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

#if NASD_DRIVE_PREREAD_NPT > 0
  nasd_printf("DRIVE: prereading %u NPT blocks\n", 2*nasd_odc_state->npt_sz);
  rc = nasd_fetch_npt();
  if (rc) {
    nasd_printf("DRIVE WARNING: got 0x%x (%s) prereading NPT\n",
      rc, nasd_error_string(rc));
  }
  nasd_printf("DRIVE: done prereading NPT\n");
#endif /* NASD_DRIVE_PREREAD_NPT > 0 */

  /* nasd_create_partition_on_format is 2 iff it's been requested and we've formatted */
  if (nasd_create_partition_on_format == 2) {
    nasd_identifier_t           first_obj;
    nasd_key_t                  bogus_key;
    nasd_blkcnt_t               blocks =
      (nasd_od_blocks - nasd_odc_state->disk->blocks_allocated) ;

    /*
     * The partition is unprotected, but nasd_od_create_partition()
     * still copies the three keys into the partition record, so hand
     * it zeroes rather than whatever happens to be on the stack.
     */
    bzero((char *)bogus_key, sizeof(nasd_key_t));

    nasd_printf("DRIVE: creating maximally sized (%lu blocks) unprotected partition on format\n",
      (u_long)blocks);
    rc = nasd_od_create_partition(1,
                                  blocks,
                                  NASD_NO_PROTECTION,
                                  bogus_key,
                                  bogus_key,
                                  bogus_key,
                                  &first_obj);
    if (rc) {
      /* Not fatal: the drive still comes up, just without a partition. */
      nasd_printf("DRIVE ERROR: Could not create partition.  Continuing rc=0x%x (%s)\n",
                  rc, nasd_error_string(rc));
    }
  }
#if 1
  /* A cosmetic kludge; if we've created the partition, then the
     partition creation has already printed out the info */
  if (nasd_create_partition_on_format!=2) {
    nasd_od_show_info();
  }
#endif
  nasd_create_partition_on_format = 0;

  return(NASD_SUCCESS);
}

/*
 * Compute how many blocks each node pagetable should occupy.
 *
 *   nblocks    not consulted; the computation uses the global
 *              nasd_od_blocks.
 *              NOTE(review): this looks like it was meant to size the
 *              table from nblocks - confirm against all callers before
 *              changing, since it affects the on-disk layout.
 *   expectedp  out: number of NPT blocks
 *
 * When alignment is forced (nasd_od_force_align_disk_boundary), the
 * count is rounded up to a multiple of the alignment expressed in
 * blocks.
 *
 * Returns NASD_SUCCESS, or NASD_BAD_LEN when the requested alignment
 * is not a whole number of blocks.
 */
nasd_status_t
nasd_drive_nnpt_expected(
  nasd_blkcnt_t   nblocks,
  int            *expectedp)
{
  int nnpt, sect_align, ablks;
  nasd_uint64 expect_nodes;

  /*
   * expect_nodes is the maximum number of nodes we think
   * we're likely to see. It becomes the number of node
   * entries we can store in the level-1 node pagetable.
   *
   * Widen before multiplying: blocks * percentage can exceed the
   * range of the (narrower) operand types on large disks.
   */
  expect_nodes = ((nasd_uint64)nasd_od_blocks * nasd_od_nnpt_pcg) / 100;
  nnpt = (int)(expect_nodes / NASD_OD_NODES_PER_NPT_BLOCK) + 1;

  sect_align = nasd_od_force_align_disk_boundary >> NASD_OD_SECT_SHIFT;

  if (sect_align) {
    /* The boundary must be at least one block and a multiple of it. */
    if ((sect_align < NASD_OD_SECTORS_PER_BLK)
      || (sect_align % NASD_OD_SECTORS_PER_BLK))
    {
      nasd_printf("DRIVE: "
        "cannot align blocks (%u sectors) to %u sector boundaries\n",
        NASD_OD_SECTORS_PER_BLK, sect_align);
      return(NASD_BAD_LEN);
    }
    /* Round nnpt up to the next multiple of the alignment in blocks. */
    ablks = sect_align / NASD_OD_SECTORS_PER_BLK;
    if (nnpt % ablks) {
      nnpt += (ablks - (nnpt % ablks));
    }
    NASD_ASSERT(nnpt%ablks == 0);
  }

  *expectedp = nnpt;
  return(NASD_SUCCESS);
}

/*
 * Format a disk, and init in-memory data structures
 *
 * nasd_odc_state->dev bound before calling
 *
 * Steps, in order: reset the nvram state and the in-core disk header,
 * lay out the two node pagetables (NPT1 directly followed by NPT2),
 * clear every partition record, initialize all refcount blocks, zero
 * the NPT blocks, run the layout-specific format, write the disk
 * header and flush the dirty cache blocks.
 *
 * On success, a nonzero nasd_create_partition_on_format is changed
 * to 2 so that nasd_setup_disk() knows a format really took place.
 *
 * Returns NASD_SUCCESS or the status of the step that failed; a
 * failure to obtain a refcount block panics instead of returning.
 */
nasd_status_t
nasd_format_disk(
  nasd_od_config_t  *config)
{
  int i, nnpt, j, refd, ko, sect_align;
  nasd_uint64 can_nodes_blk2;
  nasd_blkno_t frb, lrb, kb;
  nasd_od_part_t *part;
  nasd_odc_ent_t *ent;
  nasd_sectno_t ks;
  nasd_status_t rc;

#if MJU_DEBUG
  nasd_printf("nasd_format_disk() called\n");
#endif /* MJU_DEBUG */

  sect_align = nasd_od_force_align_disk_boundary >> NASD_OD_SECT_SHIFT;

  /* Size of each node pagetable, in blocks. */
  rc = nasd_drive_nnpt_expected(nasd_od_blocks, &nnpt);
  if (rc)
    return(rc);

  /* Assigned but not read anywhere else in this function. */
  can_nodes_blk2 = nnpt * 2;

  /* Fresh format: nothing to recover, nvram is valid and clean. */
  nasd_odc_need_recover = 0;
  nasd_odc_state->nvstate->key1 = NASD_C_KEY1;
  nasd_odc_state->nvstate->key2 = NASD_C_KEY2;
  nasd_gettime(&nasd_odc_state->nvstate->mod_time);
  for(i=0;i<NASD_ODC_T_NUM;i++) {
    nasd_odc_state->nvstate->dirty_counts[i] = 0;
  }

  nasd_odc_state->mod_complete = nasd_odc_state->nvstate->mod_time;

  /* Header timestamps all start out equal to the nvram mod_time. */
  nasd_odc_state->disk->mod_time = nasd_odc_state->nvstate->mod_time;
  nasd_odc_state->disk->format_time = nasd_odc_state->nvstate->mod_time;
  nasd_odc_state->disk->npt_ext.first = 1; /* we're in 1-based land */
  nasd_odc_state->disk->npt_ext.last = 1 + nnpt - 1;
  nasd_odc_state->npt_sz = nnpt;

  /* NPT2 immediately follows NPT1 and has the same length. */
  nasd_odc_state->disk->npt2_ext.first = nasd_odc_state->disk->npt_ext.last + 1;
  nasd_odc_state->disk->npt2_ext.last = nasd_odc_state->disk->npt2_ext.first + nnpt - 1;
  nasd_odc_state->disk->layout_type = config->layout_type;

  if (sect_align) {
    /* The first block after NPT2 must land on the forced boundary. */
    ks = nasd_odc_real_sectno(nasd_odc_state->disk->npt2_ext.last+1,
      NASD_ODC_T_ANON);
    ko = ks % sect_align;
    NASD_ASSERT(ko == 0);
  }

  /* kb/ks: first block after the pagetables, and its real sector. */
  kb = nasd_odc_state->disk->npt2_ext.last + 1;
  ks = nasd_odc_real_sectno(kb, NASD_ODC_T_ANON);
  nasd_printf("DRIVE: formatting with first anon at %u (%u)\n", kb, ks);
  nasd_printf("DRIVE: NPT1 %u..%u  NPT2 %u..%u\n",
    nasd_odc_state->disk->npt_ext.first,
    nasd_odc_state->disk->npt_ext.last,
    nasd_odc_state->disk->npt2_ext.first,
    nasd_odc_state->disk->npt2_ext.last);


  /* Only the two pagetables are allocated on a freshly formatted disk. */
  nasd_odc_state->disk->blocks_allocated = 2*nnpt;

  /* First and last refcount blocks that cover the pagetable blocks. */
  frb = NASD_ODC_REFBLK_OF(nasd_odc_state->disk->npt_ext.first);
  lrb = NASD_ODC_REFBLK_OF(nasd_odc_state->disk->npt2_ext.last);

  /* Wipe every partition record; generation numbers start at 1. */
  for(i=0;i<NASD_OD_MAXPARTS;i++) {
    part = &PART(i);
    bzero((char *)part, sizeof(nasd_od_part_t));
    part->generation = 1;
  }

  /*
   * Init refcounts. NPT blocks get 1, everything else gets 0.
   */
  refd = 0; /* number of refcount entries set to 1 so far */
  for(i=0;i<nasd_odc_refblocks;i++) {
    rc = nasd_odc_block_get(NULL, (nasd_blkno_t)i,
        NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_LOAD,
        &ent, NASD_ID_NULL, 0, NASD_ODC_T_REFCNT, NULL);
    if (rc != NASD_SUCCESS)
      NASD_PANIC();
    NASD_ODC_LOCK_BLOCK(ent);
    nasd_odc_wait_not_busy_invalid(ent);
    bzero((char *)ent->data.buf, NASD_OD_BASIC_BLOCKSIZE);
    NASD_ASSERT(i >= frb);
    if (i <= lrb) {
      /* Mark entries from the start of the block until all allocated
         (pagetable) blocks have been counted. */
      for(j=0;((j<NASD_OD_REFS_PER_BLOCK)&&(refd<nasd_odc_state->disk->blocks_allocated));j++) {
        ent->data.cnt[j] = 1;
        refd++;
      }
    }
    nasd_odc_dirty_ent(ent);
    NASD_ODC_UNLOCK_BLOCK(ent);
    nasd_odc_block_release(ent);
  }

  /*
   * Init NPT blocks. All zeroes (no nodes).
   *
   * The blocks are obtained without a load flag, so each one is
   * zeroed in place, has NASD_CD_INVALID cleared by hand and is marked
   * dirty; the broadcast signals ent->cond after the block is unlocked.
   */
  for(i=nasd_odc_state->disk->npt_ext.first;
    i<=nasd_odc_state->disk->npt_ext.last;i++)
  {
    rc = nasd_odc_block_get(NULL, (nasd_blkno_t)i,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
      &ent, NASD_ID_NULL, 0, NASD_ODC_T_NPT1, NULL);
    if (rc) {
      nasd_printf("DRIVE ERROR: could not get NPT block %u for init\n",
        (unsigned int)i);
      return(rc);
    }
    NASD_ODC_LOCK_BLOCK(ent);
    bzero((char *)ent->data.buf, NASD_OD_BASIC_BLOCKSIZE);
    ent->data_flags &= ~NASD_CD_INVALID;
    nasd_odc_dirty_ent(ent);
    NASD_ODC_UNLOCK_BLOCK(ent);
    NASD_BROADCAST_COND(ent->cond);
    nasd_odc_block_release(ent);
  }
  /* Same treatment for the second pagetable. */
  for(i=nasd_odc_state->disk->npt2_ext.first;
    i<=nasd_odc_state->disk->npt2_ext.last;i++)
  {
    rc = nasd_odc_block_get(NULL, (nasd_blkno_t)i,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK,
      &ent, NASD_ID_NULL, 0, NASD_ODC_T_NPT2, NULL);
    if (rc) {
      nasd_printf("DRIVE ERROR: could not get NPT2 block %u for init\n",
        (unsigned int)i);
      return(rc);
    }
    NASD_ODC_LOCK_BLOCK(ent);
    bzero((char *)ent->data.buf, NASD_OD_BASIC_BLOCKSIZE);
    ent->data_flags &= ~NASD_CD_INVALID;
    nasd_odc_dirty_ent(ent);
    NASD_ODC_UNLOCK_BLOCK(ent);
    NASD_BROADCAST_COND(ent->cond);
    nasd_odc_block_release(ent);
  }

  /*
   * do any layout-specific formatting.
   */
  rc = nasd_od_layout_format(config, kb);
  if (rc) {
    nasd_printf("DRIVE ERROR: failed layout-specific format, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  /*
   * Write out the new disk header
   */
  NASD_ODC_LOCK_DISK();
  rc = nasd_od_write_diskstate(1);
  NASD_ODC_UNLOCK_DISK();
  if (rc) {
    nasd_printf("DRIVE ERROR: writing disk state, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }

  /*
   * Flush all the dirty blocks (which would be the above
   * dirty refblocks)
   */
  rc = nasd_odc_flush_dirty(1);
  if (rc) {
    nasd_printf("DRIVE ERROR: formatting disk, flushing refblocks, rc=0x%x (%s)\n",
      rc, nasd_error_string(rc));
    return(rc);
  }
  /* Tell nasd_setup_disk() that the requested format has happened. */
  if (nasd_create_partition_on_format) {
    nasd_create_partition_on_format = 2;
  }
  nasd_printf("DRIVE: format complete\n");

  return(NASD_SUCCESS);
}


/*
 * Create partition partnum with a quota of nblocks blocks and create
 * its first object.
 *
 *   partnum         partition slot, 0 .. NASD_OD_MAXPARTS-1
 *   nblocks         partition size in blocks, must be nonzero
 *   min_protection  stored in the partition record
 *   partition_key, red_key, black_key
 *                   copied into the partition record
 *   first_obj_idp   out (may be NULL): identifier of the partition's
 *                   first object, set once that object was created
 *
 * Returns
 *   NASD_BAD_LEN        nblocks is 0
 *   NASD_BAD_PARTITION  partnum out of range, or slot already in use
 *   NASD_NO_SPACE       not enough unallocated blocks on the drive
 *   other               status of object creation, of writing the disk
 *                       header, or of flushing the cache
 *
 * A failure of the header write or of the cache flush is reported
 * after the partition has already been set up in memory; nothing is
 * rolled back in those cases.
 */
nasd_status_t
nasd_od_create_partition(
  int                 partnum,
  nasd_blkcnt_t       nblocks,
  nasd_uint16         min_protection,
  nasd_key_t          partition_key,
  nasd_key_t          red_key,
  nasd_key_t          black_key,
  nasd_identifier_t  *first_obj_idp)
{
  nasd_attribute_t in_attr, out_attr;
  nasd_fieldmask_t fieldmask;
  nasd_timespec_t cur_time;
  nasd_odc_icpart_t *icp;
  nasd_status_t rc, rc2;
  nasd_od_part_t *part;

  if (nblocks == 0)
    return(NASD_BAD_LEN);

  nasd_gettime(&cur_time);

  /* partnum indexes two arrays below; reject negatives as well. */
  if ((partnum < 0) || (partnum >= NASD_OD_MAXPARTS))
    return(NASD_BAD_PARTITION);

  NASD_ODC_LOCK_DISK();

  icp = &nasd_odc_state->parts[partnum];
  part = &PART(partnum);

  NASD_ODC_ICPART_LOCK_WRITE(icp);
  if (part->part_size) {
    /* A nonzero size marks the slot as already configured. */
    rc = NASD_BAD_PARTITION;
  }
  else {
    if (nblocks > (nasd_od_blocks - nasd_odc_state->disk->blocks_allocated)) {
      rc = NASD_NO_SPACE;
    }
    else {
      /* Reserve the blocks and fill in the partition record. */
      nasd_odc_state->disk->blocks_allocated += nblocks;
      part->part_size = nblocks;
      part->blocks_used = 0;
      part->blocks_allocated = 0;
      part->generation = 1;
      part->last_cr_del = cur_time;
      part->num_obj = 0;
      part->min_protection=min_protection;
      bcopy((char *)partition_key, (char *)part->partition_key,
            sizeof(nasd_key_t));
      bcopy((char *)red_key, (char *)part->red_key, sizeof(nasd_key_t));
      bcopy((char *)black_key, (char *)part->black_key, sizeof(nasd_key_t));
      fieldmask = 0;
      bzero((char *)&in_attr, sizeof(in_attr));
      rc = nasd_obj_create(partnum, &in_attr, fieldmask,
        &part->first_obj, &out_attr, 1);

      if (rc) {
        /* Undo the reservation; the slot reads as unused again. */
        part->part_size = 0;
        part->num_obj = 0;
        nasd_odc_state->disk->blocks_allocated -= nblocks;
      }
      else {
        nasd_gettime(&part->create_time);
        part->mod_time = part->create_time;
        nasd_odc_state->nvstate->mod_time = part->create_time;
        if (first_obj_idp != NULL)
          *first_obj_idp = part->first_obj;
        rc = nasd_od_write_diskstate(1);
        if (rc) {
          nasd_printf("DRIVE ERROR: creating partition, writing disk state, rc=0x%x (%s)\n",
            rc, nasd_error_string(rc));
        }
      }
    }
  }
  NASD_ODC_ICPART_UNLOCK_WRITE(icp);

  NASD_ODC_UNLOCK_DISK();

  if (rc == NASD_SUCCESS) {
    nasd_printf("Created partition %d with %lu blocks\n", partnum, (u_long)nblocks);
    nasd_od_show_info();
  }

#if 1
  if (rc == NASD_SUCCESS) {
    /* Push the new object's blocks out to disk before reporting success. */
    rc2 = nasd_odc_flush_dirty(1);
    if (rc2) {
      nasd_printf("DRIVE ERROR: creating partition, flushing cache, rc=0x%x (%s)\n",
        rc2, nasd_error_string(rc2));
      return(rc2);
    }
  }
#endif

  return(rc);
}

/*
 * Change an existing partition. Not implemented: apart from rejecting
 * a zero block count with NASD_BAD_LEN, every request is answered
 * with NASD_OP_NOT_SUPPORTED.
 */
nasd_status_t
nasd_od_change_partition(
  int                partnum,
  nasd_blkcnt_t      nblocks,
  nasd_uint16        min_protection,
  nasd_identifier_t  first_obj_id)
{
  nasd_timespec_t now;

  if (nblocks != 0) {
    /* The timestamp is taken but not used yet. */
    nasd_gettime(&now);
    return(NASD_OP_NOT_SUPPORTED);
  }

  return(NASD_BAD_LEN);
}

/*
 * Debugging: dump disk/partition info
 */
void
nasd_od_show_info()
{
  nasd_od_part_t *p;
  nasd_od_disk_t *d;
  nasd_sectno_t ks;
  nasd_blkno_t kb;
  int i, j;

  kb = nasd_odc_state->disk->npt2_ext.last + 1;
  ks = nasd_odc_real_sectno(kb, NASD_ODC_T_ANON);

  d = nasd_odc_state->disk;
  nasd_printf("\nCurrent disk state:\n");
  nasd_printf("Node pagetable 1 at %lu..%lu (%u..%u)\n",
    (u_long)d->npt_ext.first, (u_long)d->npt_ext.last,
    nasd_odc_real_sectno(d->npt_ext.first, NASD_ODC_T_NPT1),
    nasd_odc_real_sectno(d->npt_ext.last, NASD_ODC_T_NPT1));
  nasd_printf("Node pagetable 2 at %lu..%lu (%u..%u)\n",
    (u_long)d->npt2_ext.first, (u_long)d->npt2_ext.last,
    nasd_odc_real_sectno(d->npt2_ext.first, NASD_ODC_T_NPT2),
    nasd_odc_real_sectno(d->npt2_ext.last, NASD_ODC_T_NPT2));
  nasd_printf("First anon block at %u (%u)\n", kb, ks);
  nasd_printf("Blocks allocated: %lu (%lu unallocated)\n",
    (u_long)d->blocks_allocated,
    (u_long)(nasd_od_blocks - d->blocks_allocated));
  for(j=i=0;i<NASD_OD_MAXPARTS;i++) {
    p = &d->partitions[i];
    if (p->part_size) {
      j++;
      nasd_printf("Partition %d:\n", i);
      nasd_printf("  part_size %lu\n", (u_long)p->part_size);
      nasd_printf("  blocks_used %lu\n", (u_long)p->blocks_used);
      nasd_printf("  blocks_allocated %lu\n", (u_long)p->blocks_allocated);
      nasd_printf("  first_obj 0x%" NASD_ID_FMT "\n", p->first_obj);
      nasd_printf("  num_obj %" NASD_64u_FMT "\n", p->num_obj);
      nasd_printf("  last_cr_del %d:%09d\n", p->last_cr_del.ts_sec,
        p->last_cr_del.ts_nsec);
      nasd_printf("  min_protection %x\n",p->min_protection);
      nasd_printf("  red_key %" NASD_64x_FMT "\n",
        *((nasd_uint64 *)p->red_key));
      nasd_printf("  black_key %" NASD_64x_FMT "\n",
             *((nasd_uint64 *)p->black_key));
      nasd_printf("  partition_key %" NASD_64x_FMT "\n",
             *((nasd_uint64 *)p->partition_key));
    }
  }
  if (j == 0) {
    nasd_printf("No partitions configured\n");
  }
}

/*
 * Record "now" as the modification time of partition partnum.
 *
 * Caller must have icpart[partnum] write lock held
 */
void
nasd_part_modified(
  int  partnum)
{
  nasd_od_part_t *rec = &PART(partnum);

  nasd_gettime(&rec->mod_time);
}

/* how many NPT blocks to grab at a time */
#define MAX_NPT_FETCHCHUNK 16

/*
 * Finish one chunk of NPT prefetching.
 *
 *   ichain  list head of the pending I/O chain; it is empty when it
 *           points at itself
 *   ents    the nents cache entries obtained for this chunk
 *   nents   number of valid entries in ents
 *
 * Any chained I/O is detached from ichain and enqueued as a
 * high-priority read; then every entry is waited on, released, and its
 * slot in ents is cleared.
 */
void
nasd_fetch_npt_helper(
  nasd_odc_ent_t   *ichain,
  nasd_odc_ent_t  **ents,
  int               nents)
{
  nasd_odc_ent_t *first_io;
  nasd_odc_ent_t **slot;

  if (nents == 0) {
    /* Nothing was fetched, so nothing may be queued either. */
    NASD_ASSERT(ichain->inext == ichain);
    return;
  }

  /*
   * Actually issue any I/Os which may be required
   */
  if (ichain->inext != ichain) {
    first_io = ichain->inext;
    /* cut the chain loose from its head, then reset the head to empty */
    ichain->iprev->inext = NULL;
    ichain->inext->iprev = NULL;
    ichain->inext = ichain;
    ichain->iprev = ichain;
    nasd_od_io_enq(first_io, NASD_U_READ, NASD_IO_PRI_HI);
  }

  /*
   * Wait for I/Os to complete, dump the blocks
   */
  for (slot = ents; slot < ents + nents; slot++) {
    nasd_odc_wait_not_busy_invalid(*slot);
    nasd_odc_block_release(*slot);
    *slot = NULL;
  }
}

nasd_status_t
nasd_fetch_npt()
{
  nasd_odc_ent_t *ents[MAX_NPT_FETCHCHUNK], ich;
  nasd_blkno_t blk;
  nasd_status_t rc;
  int i;

  ich.inext = ich.iprev = &ich;
  i = 0;
  rc = NASD_SUCCESS;

  for(blk=nasd_odc_state->disk->npt_ext.first;
    blk<=nasd_odc_state->disk->npt_ext.last;
    blk++)
  {
    rc = nasd_odc_block_get(NULL, blk,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
      &ents[i], NASD_ID_NULL, 0, NASD_ODC_T_NPT1, &ich);
    if (rc)
      break;
    i++;
    if (i == MAX_NPT_FETCHCHUNK) {
      nasd_fetch_npt_helper(&ich, ents, i);
      i = 0;
    }
    NASD_ASSERT(i < MAX_NPT_FETCHCHUNK);
  }

  if (rc) {
    nasd_fetch_npt_helper(&ich, ents, i);
    return(rc);
  }

  for(blk=nasd_odc_state->disk->npt2_ext.first;
    blk<=nasd_odc_state->disk->npt2_ext.last;
    blk++)
  {
    rc = nasd_odc_block_get(NULL, blk,
      NASD_ODC_L_FORCE|NASD_ODC_L_BLOCK|NASD_ODC_L_MLOAD,
      &ents[i], NASD_ID_NULL, 0, NASD_ODC_T_NPT2, &ich);
    if (rc)
      break;
    i++;
    if (i == MAX_NPT_FETCHCHUNK) {
      nasd_fetch_npt_helper(&ich, ents, i);
      i = 0;
    }
    NASD_ASSERT(i < MAX_NPT_FETCHCHUNK);
  }

  nasd_fetch_npt_helper(&ich, ents, i);

  return(rc);
}

/*
 * Shutdown-list callback: destroy the opholder freelist that
 * nasd_setup_disk() created. The argument is not used.
 */
void
nasd_od_shutdown_drive_opholder_freelist(
  void  *ignored)
{
  (void)ignored;

  NASD_FREELIST_DESTROY(nasd_drive_opholder_freelist,next,
    (nasd_drive_opholder_t *));
}

/*
 * Remote shutdown request: forwards flags to nasd_od_sys_rshutdown()
 * and returns its status unchanged.
 */
nasd_status_t
nasd_drive_rshutdown(
  nasd_drive_rshutdown_flags_t  flags)
{
  return(nasd_od_sys_rshutdown(flags));
}

/*
 * Fill in drive information. Only info->cur_time is written (with the
 * current time); always returns NASD_SUCCESS.
 */
nasd_status_t
nasd_drive_getinfo(
  nasd_drive_info_t  *info)
{
  nasd_timespec_t *stamp;

  stamp = &info->cur_time;
  nasd_gettime(stamp);

  return(NASD_SUCCESS);
}

/* Local Variables:  */
/* indent-tabs-mode: nil */
/* tab-width: 2 */
/* End: */
