/* genePred.c was originally generated by the autoSql program, which also 
 * generated genePred.h and genePred.sql.  This module links the database and the RAM 
 * representation of objects. */

#include "common.h"
#include "gff.h"
#include "jksql.h"
#include "psl.h"
#include "linefile.h"
#include "genePred.h"
#include "genbank.h"
#include "rangeTree.h"
#include "hdb.h"


/* SQL to create a genePred table */
static char *createSql = 
"CREATE TABLE %s ("
"   %s"                                 /* bin column goes here */
"   name varchar(255) not null,"	/* mrna accession of gene */
"   chrom varchar(255) not null,"	/* Chromosome name */
"   strand char(1) not null,"		/* + or - for strand */
"   txStart int unsigned not null,"	/* Transcription start position */
"   txEnd int unsigned not null,"	/* Transcription end position */
"   cdsStart int unsigned not null,"	/* Coding region start */
"   cdsEnd int unsigned not null,"	/* Coding region end */
"   exonCount int unsigned not null,"	/* Number of exons */
"   exonStarts longblob not null,"	/* Exon start positions */
"   exonEnds longblob not null,"	/* Exon end positions */
"   INDEX(name)";

static char *binFieldSql = 
"    bin smallint unsigned not null,"
"    INDEX(chrom,bin),";

static char *noBinIndexSql = 
"   INDEX(chrom,txStart),";

static char *scoreFieldSql = 
"   ,score int";

static char *name2FieldSql = 
"  ,name2 varchar(255) not null,"    /* Secondary name. (e.g. name of gene) or NULL if not available */
"   INDEX(name2)";

static char *cdsStatFieldSql = 
"  ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null,"    /* Status of cdsStart annotation */
"   cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null";     /* Status of cdsEnd annotation */

static char *exonFramesFieldSql = 
"   ,exonFrames longblob not null";    /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */

struct genePred *genePredLoad(char **row)
/* Load a genePred from row fetched with select * from genePred
 * from database.  Dispose of this with genePredFree(). */
{
struct genePred *ret;
int sizeOne;

AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
return ret;
}

struct genePred *genePredLoadAll(char *fileName) 
/* Load all genePred from a whitespace-separated file.
 * Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];

while (lineFileRow(lf, row))
    {
    el = genePredLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct genePred *genePredLoadAllByChar(char *fileName, char chopper) 
/* Load all genePred from a chopper separated file.
 * Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];

while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
    {
    el = genePredLoad(row);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

struct genePred *genePredCommaIn(char **pS, struct genePred *ret)
/* Create a genePred out of a comma separated string. 
 * This will fill in ret if non-null, otherwise will
 * return a new genePred */
{
char *s = *pS;
int i;

if (ret == NULL)
    AllocVar(ret);
ret->name = sqlStringComma(&s);
ret->chrom = sqlStringComma(&s);
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
ret->txStart = sqlUnsignedComma(&s);
ret->txEnd = sqlUnsignedComma(&s);
ret->cdsStart = sqlUnsignedComma(&s);
ret->cdsEnd = sqlUnsignedComma(&s);
ret->exonCount = sqlUnsignedComma(&s);
s = sqlEatChar(s, '{');
AllocArray(ret->exonStarts, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
    {
    ret->exonStarts[i] = sqlUnsignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->exonEnds, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
    {
    ret->exonEnds[i] = sqlUnsignedComma(&s);
    }
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
*pS = s;
return ret;
}

void genePredFree(struct genePred **pEl)
/* Free a single dynamically allocated genePred such as created
 * with genePredLoad(). */
{
struct genePred *el;

if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->chrom);
freeMem(el->exonStarts);
freeMem(el->exonEnds);
freeMem(el->name2);
freeMem(el->exonFrames);
freez(pEl);
}

void genePredFreeList(struct genePred **pList)
/* Free a list of dynamically allocated genePred's */
{
struct genePred *el, *next;

for (el = *pList; el != NULL; el = next)
    {
    next = el->next;
    genePredFree(&el);
    }
*pList = NULL;
}

void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep) 
/* Print out genePred.  Separate fields with sep. Follow last field with lastSep. */
{
int i;
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->txStart);
fputc(sep,f);
fprintf(f, "%u", el->txEnd);
fputc(sep,f);
fprintf(f, "%u", el->cdsStart);
fputc(sep,f);
fprintf(f, "%u", el->cdsEnd);
fputc(sep,f);
fprintf(f, "%u", el->exonCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
    {
    fprintf(f, "%u", el->exonStarts[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
    {
    fprintf(f, "%u", el->exonEnds[i]);
    fputc(',', f);
    }
if (sep == ',') fputc('}',f);

/* optional fields, >= test is used so unspecified coumns can be filled in */
if (el->optFields >= genePredScoreFld)
    {
    fputc(sep,f);
    fprintf(f, "%d", el->score);
    }
if (el->optFields >= genePredName2Fld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : ""));
    if (sep == ',') fputc('"',f);
    }
if (el->optFields >= genePredCdsStatFld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat));
    if (sep == ',') fputc('"',f);
    fputc(sep,f);
    if (sep == ',') fputc('"',f);
    fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat));
    if (sep == ',') fputc('"',f);
    }
if (el->optFields >= genePredExonFramesFld)
    {
    fputc(sep,f);
    if (sep == ',') fputc('{',f);
    for (i=0; i<el->exonCount; ++i)
        {
        fprintf(f, "%d", el->exonFrames[i]);
        fputc(',', f);
        }
    if (sep == ',') fputc('}',f);
    }
fputc(lastSep,f);
}

/* ---------  Start of hand generated code. ---------------------------- */

char *genePredCdsStatStr(enum cdsStatus stat)
/* get string value of a cdsStatus */
{
switch (stat) {
case cdsNone:
    return "none";
case cdsUnknown:
    return "unk";
case cdsIncomplete:
    return "incmpl";
case cdsComplete:
    return "cmpl";
default:
    errAbort("invalid cdsStatus enum value: %d", stat);
    return NULL;  /* make compiler happy */
}
}

static enum cdsStatus parseCdsStat(char *statStr)
/* parse a cdsStatus string */
{
if ((statStr == NULL) || sameString(statStr, "none"))
    return cdsNone;
if (sameString(statStr, "unk"))
    return cdsUnknown;
if (sameString(statStr, "incmpl"))
    return cdsIncomplete;
if (sameString(statStr, "cmpl"))
    return cdsComplete;

errAbort("invalid genePred cdsStatus: \"%s\"", statStr);
return cdsNone;  /* make compiler happy */
}

struct genePred *genePredExtLoad(char **row, int numCols)
/* Load a genePred with from a row, with optional fields.  The row must
 * contain columns in the order in the struct, and they must be present up to
 * the last specfied optional field.  Missing intermediate fields must have
 * zero or empty columns, they may not be omitted.  Fields at the end can be
 * omitted. Dispose of this with genePredFree(). */
{
struct genePred *ret;
int sizeOne, iCol;

AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
    errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
             ret->name, sizeOne, ret->exonCount);

iCol=GENEPRED_NUM_COLS;
if (iCol < numCols)
    {
    ret->score = sqlSigned(row[iCol++]);
    ret->optFields |= genePredScoreFld;
    }
if (iCol < numCols)
    {
    ret->name2 = cloneString(row[iCol++]);
    ret->optFields |= genePredName2Fld;
    }
if (iCol < numCols)
    {
    ret->cdsStartStat = parseCdsStat(row[iCol++]);
    ret->optFields |= genePredCdsStatFld;
    }
if (iCol < numCols)
    {
    ret->cdsEndStat = parseCdsStat(row[iCol++]);
    ret->optFields |= genePredCdsStatFld;
    }
if (iCol < numCols)
    {
    sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne);
    if (sizeOne != ret->exonCount)
        errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)",
                 ret->name, sizeOne, ret->exonCount);
    ret->optFields |= genePredExonFramesFld;
    }
return ret;
}

struct genePred *genePredExtLoadAll(char *fileName)
/* Load all genePreds with from tab-separated file, possibly with optional
 * fields. Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[GENEPREDX_NUM_COLS];
int numCols;

while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
    {
    lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
    el = genePredExtLoad(row, numCols);
    slAddHead(&list, el);
    }
lineFileClose(&lf);
slReverse(&list);
return list;
}

int genePredCmp(const void *va, const void *vb)
/* Compare to sort based on chromosome, txStart. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
    dif = a->txStart - b->txStart;
return dif;
}

int genePredNameCmp(const void *va, const void *vb)
/* Compare to sort based on name, then chromosome, txStart. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int dif = strcmp(a->name, b->name);
if (dif == 0)
    dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
    dif = a->txStart - b->txStart;
return dif;
}

static boolean haveStartStopCodons(struct gffFile *gff)
/* For GFFs, determine if any of the annotations use start_codon or                                                                                                                 
 * stop_codon */
{
struct gffFeature *feature;
for(feature = gff->featureList; feature != NULL; feature = feature->next)
    {
    if(sameWord(feature->name, "start_codon") ||
       sameWord(feature->name, "start_codon"))
        return TRUE;
    }
return FALSE;
}

static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord)
/* determine if a feature is an exon; different criteria for GFF ane GTF */
{
if (isGtf)
    {
    /* must check for stop_codon here, because GTF put it outside of the
     * CDS */
    return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon")
            || sameWord(feat, "start_codon") || sameWord(feat, "exon")
            || sameWord(feat, "utr") || sameWord(feat, "5utr")
            || sameWord(feat, "3utr"));
    }
else
    {
    return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord));
    }
}

static boolean isStartStopCodon(char *feat)
/* determine if a feature is GTF style start/stop codon */
{
return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon");
}

static boolean isCds(char *feat)
/* determine if a feature is CDS */
{
return sameWord(feat, "CDS") || isStartStopCodon(feat);
}

static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp)
/* check that a gffLine is consistent with the genePred being built.  this
 * helps detect some problems that lead to corrupt genePreds */
{
if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0])))
    {
    fprintf(stderr, "invalid gffGroup detected on line: ");
    gffTabOut(gl, stderr);
    errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand",
             group->name, gp->chrom, gp->strand[0],
                     gl->seq, gl->strand);
    }
}

static int phaseToFrame(char phase)
/* convert GTF/GFF phase to frame */
{
switch (phase)
    {
    case '0':
        return 0;
    case '1':
        return 2;
    case '2':
        return 1;
    }
return -1;
}

#ifdef NOT_CURRENTLY_USED
static char frameToPhase(int frame)
/* convert GTF/GFF frame to phase */
{
switch (frame)
    {
    case 0:
        return '0';
    case 1:
        return '2';
    case 2:
        return '1';
    }
return -1;
}
#endif

static int incrFrame(int frame, int amt)
/* increment frame, no-op if frame is -1 */
{
if (frame < 0)
    return frame;
else if (amt >= 0)
    return (frame+amt) % 3;
else
    {
    int amt3 = (-amt) % 3;
    return (frame - (amt - amt3)) % 3;
    }
}

static int findPosExon(struct genePred *gp, int pos)
/* find exon containing the specified position */
{
int i;
for (i = 0; i < gp->exonCount; i++)
    {
    if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i]))
        return i;
    }
errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name);
return 0;
}

static int findLastFramedExon(struct genePred *gp)
/* locate the last exon with frame */
{
int iStart = (gp->strand[0] == '+') ? 0 : gp->exonCount-1;
int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1;
int dir =  (gp->strand[0] == '+') ? +1 : -1;
int iPrevExon = -1, iExon;

// find beginning of frames
for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir)
    continue;
// find last with frame
for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir)
    iPrevExon = iExon;
if (iPrevExon < 0)
    errAbort("bug: findLastFramedExon should have found an exon with frame");
return iPrevExon;
}

static void extendFramePos(struct genePred *gp)
/* extend frame missing from last exon(s) for positive strand genes, normally
 * caused by GTF stop_codon */
{
int iExon = findLastFramedExon(gp);
int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++)
    {
    assert(gp->exonFrames[iExon] < 0);
    gp->exonFrames[iExon] = frame;
    frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
    }
}

static void extendFrameNeg(struct genePred *gp)
/* extend frame missing from last exon(s) for negative strand genes, normally
 * caused by GTF stop_codon */
{
int iExon = findLastFramedExon(gp);
int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--)
    {
    assert(gp->exonFrames[iExon] < 0);
    gp->exonFrames[iExon] = frame;
    frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon]));
    }
}

static void fixStopFrame(struct genePred *gp)
/* Handle nasty corner case: GTF with the all or part of the stop codon as the
 * only codon in an exon, we must set the frame on these exons.  Some ENSEMBL
 * GTFs have single base `exons' with just a base of the stop codon, so must
 * check every base.  While less than ideal, we just extend the frame past the
 * last exon with frame, so a conflicting frame on the stop codon will be
 * missed.  Just switching to GFF3 fixes this mess. */
{
if (gp->strand[0] == '+')
    extendFramePos(gp);
else
    extendFrameNeg(gp);
}

static boolean adjImpliedStopPos(struct genePred *gp)
/* implied stop codon adjustment for positive strand gene */
{
int iExon = findPosExon(gp, gp->cdsEnd-1);
unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd);
if (space >= 3)
    {
    // room in current exon
    gp->cdsEnd += 3;
    return TRUE;
    }
else if ((iExon < gp->exonCount)
         && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3)
    {
    // room including next exon
    gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space);
    return TRUE;
    }
return FALSE;
}

static boolean adjImpliedStopNeg(struct genePred *gp)
/* implied stop codon adjustment for negative strand gene */
{
int iExon = findPosExon(gp, gp->cdsStart);
unsigned space = (gp->cdsStart-gp->exonStarts[iExon]);
if (space >= 3)
    {
    // room in current exon
    gp->cdsStart -= 3;
    return TRUE;
    }
else if ((iExon > 0)
         && ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3)
    {
    // room including previous exon
    gp->cdsStart = gp->exonEnds[iExon-1] - (3-space);
    return TRUE;
    }
return FALSE;
}

static void adjImpliedStopAfterCds(struct genePred *gp)
/* adjust CDS bounds to include implied stop codon outside of CDS. */
{
if (gp->strand[0] == '+')
    {
    if (adjImpliedStopPos(gp))
        {
        if (gp->optFields & genePredCdsStatFld)
            gp->cdsEndStat = cdsComplete;
        }
    }
else
    {
    if (adjImpliedStopNeg(gp))
        {
        if (gp->optFields & genePredCdsStatFld)
            gp->cdsStartStat = cdsComplete;
        }
    }
}

static void checkForNoFrames(struct genePred *gp)
/* check for requesting frame from a file without frames. */
{
int i;
/* Complain if we have a CDS but exonFrames are all -1: */
if (gp->cdsStart < gp->cdsEnd)
    {
    boolean foundReal = FALSE;
    for (i = 0;  i < gp->exonCount;  i++)
        {
        if (gp->exonFrames[i] >= 0 && gp->exonFrames[i] <= 2)
            {
            foundReal = TRUE;
            break;
            }
        }
    if (! foundReal)
        {
        errAbort("Error: exonFrames field is being added, but I found a "
                 "gene (%s) with CDS but no valid frames.  "
                 "This can happen if program is invoked with -genePredExt "
                 "but no valid frames are given in the file.  If the 8th "
                 "field of GFF/GTF file is always a placeholder, then don't use "
                 "-genePredExt.",
                 gp->name);
        }
    }
}

static boolean isGtfFeature(char *feat)
/* check if a feature is one of the recognized features.  All otherse
 * should be ignored. http://mblab.wustl.edu/GTF22.html */
{
static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
                           "inter", "inter_CNS", "intron_CNS", "exon", NULL};
int i;
for (i = 0; gtfFeats[i] != NULL; i++)
    {
    if (sameString(feat, gtfFeats[i]))
        return TRUE;
    }
return FALSE;
}

static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
/* should the specified line be skipped? */
{
if (isGtf)
    return !isGtfFeature(gl->feature);
else
    return FALSE;
}

static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
/* should this feature be used to assign frame? */
{
if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
    return FALSE;

// for GFF, any features with frame can be used to set frame,
// with GTF, there are some bogus files that set frame on UTR features,
// so only use CDS
if (isGtf)
    return sameString(gl->feature, "CDS");
else
    return (gl->frame != '.');
}

static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
                                             struct gffLine *gl, boolean isGtf)
/* set frame from any overlapping records with frame. Don't include
 * start/stop_codon, as it isn't a full exon. */
{
/* skip lines preceeding this exon */
while (gl != NULL)
    {
    if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
        break;
    gl = gl->next;
}
/* set frame from any overlapping records with frame. */
while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
    {
    if (allowedFrameFeature(gl, isGtf))
        {
        int frame = phaseToFrame(gl->frame);
        if (frame >= 0)
            *framePtr = frame;
        }
    gl = gl->next;
    }
return gl;
}

static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp)
/* Assign frame from GFF after genePred has been built.*/
{
struct gffLine *gl = group->lineList;
int start, end, i;
boolean haveFrame = FALSE;
for (i = 0; i < gp->exonCount; i++)
    {
    if (genePredCdsExon(gp, i, &start, &end))
        {
        gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
        if (gp->exonFrames[i] >= 0)
            haveFrame = TRUE;
        }
    }

/* make sure stop has frame if some exons in the gene had frame */
if (haveFrame)
    fixStopFrame(gp);
checkForNoFrames(gp);
}

static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name,
                                         boolean isGtf, char *exonSelectWord, unsigned optFields,
                                         unsigned options)
/* common function to create genePreds from GFFs or GTFs.  This is a little
 * ugly with to many check of isGtf, however the was way to much identical
 * code the other way. Options are from genePredFromGxfOpts */
{
struct genePred *gp;
int stopCodonStart = -1, stopCodonEnd = -1;
int cdsStart = BIGNUM, cdsEnd = -BIGNUM;
int exonCount = 0;
boolean haveStartCodon = FALSE, haveStopCodon = FALSE;
struct gffLine *gl;
unsigned *eStarts, *eEnds;
int i;

/* should we count on start/stop codon annotation in GFFs? */
boolean useStartStops = isGtf || haveStartStopCodons(gff);

/* Count up exons and figure out cdsStart and cdsEnd. */
for (gl = group->lineList; gl != NULL; gl = gl->next)
    {
    if (ignoreGxfLine(gl, isGtf))
        continue;
    if (isExon(gl->feature, isGtf, exonSelectWord))
	++exonCount;
    if (isCds(gl->feature))
        {
	if (gl->start < cdsStart)
            cdsStart = gl->start;
	if (gl->end > cdsEnd)
            cdsEnd = gl->end;
	}
    if (sameWord(gl->feature, "start_codon"))
        haveStartCodon = TRUE;
    if (sameWord(gl->feature, "stop_codon"))
        {
        /* stop_codon can be split, need bounds for adjusting CDS below */
        if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
            stopCodonStart = gl->start;
        if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd))
            stopCodonEnd = gl->end;
        haveStopCodon = TRUE;
        }
    }
if (exonCount == 0)
    return NULL;
if (cdsStart > cdsEnd)
    {
    /* no cds annotated */
    cdsStart = 0;
    cdsEnd = 0;
    }
else if (stopCodonStart >= 0)
    {
    /* adjust CDS to include stop codon as in GTF */
    if (group->strand == '+')
        {
        if (stopCodonEnd > cdsEnd)
            cdsEnd = stopCodonEnd;
        }
    else
        {
        if (stopCodonStart < cdsStart)
            cdsStart = stopCodonStart;
        }
    }

/* Allocate genePred and fill in values. */
AllocVar(gp);
gp->name = cloneString(name);
gp->chrom = cloneString(group->seq);
gp->strand[0] = group->strand;
gp->txStart = group->start;
gp->txEnd = group->end;
if (cdsStart < cdsEnd)
    {
    gp->cdsStart = cdsStart;
    gp->cdsEnd = cdsEnd;
    }
else
    {
    // no CDS, set to txEnd
    gp->cdsStart = gp->txEnd;
    gp->cdsEnd = gp->txEnd;
    }
gp->exonStarts = AllocArray(eStarts, exonCount);
gp->exonEnds = AllocArray(eEnds, exonCount);
gp->optFields = optFields;

if (optFields & genePredName2Fld)
    {
    if (options & genePredGxfGeneNameAsName2)
        {
        if (group->lineList->geneName != NULL)
            gp->name2 = cloneString(group->lineList->geneName);
        }
    else
        {
        if (group->lineList->geneId != NULL)
            gp->name2 = cloneString(group->lineList->geneId);
        }
    if (gp->name2 == NULL)
        gp->name2 = cloneString("");
    }
if (optFields & genePredCdsStatFld)
    {
    if (cdsStart < cdsEnd)
        {
        if (!useStartStops)
            {
            /* GFF doesn't require start or stop, if not used, assume complete */
            gp->cdsStartStat = cdsComplete;
            gp->cdsEndStat = cdsComplete;
            }
        else if (group->strand == '+')
            {
            gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
            gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
            }
        else
            {
            gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
            gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
            }
        }
    else
        {
        gp->cdsStartStat = cdsNone;
        gp->cdsEndStat = cdsNone;
        }
    }
if (optFields & genePredExonFramesFld)
    {
    AllocArray(gp->exonFrames, exonCount);
    for (i = 0; i < exonCount; i++)
        gp->exonFrames[i] = -1;
    }


/* adjust tx range to include stop codon */
if ((group->strand == '+') && (gp->txEnd == stopCodonStart))
     gp->txEnd = stopCodonEnd;
else if ((group->strand == '-') && (gp->txStart == stopCodonEnd))
    gp->txStart = stopCodonStart;

i = -1; /* before first exon */
/* fill in exons, merging overlaping and adjacent exons */
for (gl = group->lineList; gl != NULL; gl = gl->next)
    {
    if (ignoreGxfLine(gl, isGtf))
        continue;
    if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
        {
        chkGroupLine(group, gl, gp);
        if ((i < 0) || (gl->start > eEnds[i]))
            {
            /* start a new exon */
            ++i;
            assert(i < exonCount);
            eStarts[i] = gl->start;
            eEnds[i] = gl->end;
            }
        else
            {
            /* overlap, extend exon, picking the largest of ends */
            assert(i < exonCount);
            assert(gl->start >= eStarts[i]);
            if (gl->end > eEnds[i])
                eEnds[i] = gl->end;
            }
        }
    }
gp->exonCount = i+1;

/* adjust for flybase type of GFFs, with stop codon implied and outside of
 * CDS. */
if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
    adjImpliedStopAfterCds(gp);

if (optFields & genePredExonFramesFld)
    assignFrame(isGtf, group, gp);

return gp;
}

struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name,
                                        char *exonSelectWord, unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list. See genePred.h for details. */
{
struct gffLine *gl;
boolean anyExon = FALSE;

/* Look to see if any exons.  If not allow CDS to be used instead. */
if (exonSelectWord)
    {
    for (gl = group->lineList; gl != NULL; gl = gl->next)
	{
	if (sameWord(gl->feature, exonSelectWord))
	    {
	    anyExon = TRUE;
	    break;
	    }
	}
    }
else
    anyExon = TRUE;
if (!anyExon)
    exonSelectWord = "CDS";

return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields,
                        options);
}

struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
                                        unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list, using GTF feature conventions.
 * See genePred.h for details. */
{
return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields,
                        options);
}

static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds,
                           struct genePred* gene)
/* Convert set cdsStart/end from mrna to genomic coordinates. */
{
struct genbankCds genomeCds = genbankCdsToGenome(cds, psl);
if (genomeCds.start < genomeCds.end)
    {
    gene->cdsStart = genomeCds.start;
    gene->cdsEnd = genomeCds.end;
    if (gene->optFields & genePredCdsStatFld)
        {
        if (gene->strand[0] == '+')
            {
            gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
            gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
            }
        else
            {
            gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
            gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
            }
        }
    }
else
    {
    /* CDS not aligned */
    gene->cdsStart = gene->txEnd;
    gene->cdsEnd = gene->txEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsUnknown;
        gene->cdsEndStat = cdsUnknown;
        }
    }
}

void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, 
	struct genePred *gene)
/* Convert cdsStart/End from mrna to genomic coordinates. 
 * Note that the genePred blocks need not be filled in before
 * this call. */
{
if (cds == NULL)
    {
    /* no CDS, set to zero-length at end */
    gene->cdsStart = psl->tEnd;
    gene->cdsEnd = psl->tEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsNone;
        gene->cdsEndStat = cdsNone;
        }
    }
else if ((cds->start < 0) || (cds->end <= 0))
    {
    /* unknown CDS, set to end */
    gene->cdsStart = psl->tEnd;
    gene->cdsEnd = psl->tEnd;
    if (gene->optFields & genePredCdsStatFld)
        {
        gene->cdsStartStat = cdsUnknown;
        gene->cdsEndStat = cdsUnknown;
        }
    }
else 
    {
    /* have CDS annotation, make sure it's in bounds */
    struct genbankCds adjCds = *cds;
    if (adjCds.end > psl->qSize)
        {
        adjCds.end = psl->qSize;
        adjCds.endComplete = FALSE;
        }
    mapCdsToGenome(psl, &adjCds, gene);
    }
}

static int getFrame(struct psl *psl, int start, int end,
                    struct genbankCds* cds)
/* get the starting frame for an exon of a mRNA.  start and end are
 * the bounds of the exon within the mRNA.  This may cover multiple psl
 * blocks due to merging of small gaps. */
{
/* use the 3' end is used if it's complete, as it is more often accurate when
 * genes are defined from mRNAs sequenced with reverse-transcriptase. */
int frame = -1;
/* map to mRNA coords in CDS since frame for an exon is in direction of
 * transcription. */
if (psl->strand[0] == '-')
    reverseIntRange(&start, &end, psl->qSize);
if (start < cds->start)
    start = cds->start;
if (end > cds->end)
    end = cds->end;

if (start < end)
    {
    /* compute from end if it is complete in mRNA */
    if (cds->endComplete)
        {
        int fr = (cds->end-start) % 3;
        frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0);
        }
    else
        frame = (start-cds->start) % 3;
    }
return frame;
}

static boolean shouldMergeBlocks(struct genePred *gene, int iExon,
                                 unsigned tStart, unsigned options,
                                 int cdsMergeSize, int utrMergeSize)
/* determine if a new block starting at tStart whould be merged with
 * the preceeding exon, indicated by iExon.
 */
{
/* nothing to check if no exons have been added */
if (iExon >= 0)
    {
    boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd);
    int gapSize = (tStart - gene->exonEnds[iExon]);
    if (inCds)
        {
        if ((options & genePredPslCdsMod3) && ((gapSize % 3) != 0))
            return FALSE;  /* not a multiple of three */
        if ((cdsMergeSize >= 0) && (gapSize <= cdsMergeSize))
            return TRUE;
        }
    else 
        {
        if ((utrMergeSize >= 0) && (gapSize <= utrMergeSize))
            return TRUE;
        }
    }
return FALSE; /* don't merge */
}

static void pslToExons(struct psl *psl, struct genePred *gene,
                       struct genbankCds* cds, unsigned options,
                       int cdsMergeSize, int utrMergeSize)
/* Convert psl alignment blocks to genePred exons, merging together blocks by
 * the specified paraemeters.  Optionally add frame information. */
{
int iBlk, iExon;
int startIdx, stopIdx, idxIncr;

/* this is bigger than needed if blocks merge */
gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned));
gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned));
if (gene->optFields & genePredExonFramesFld)
    {
    /* default frames to indicate no frame */
    gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned));
    for (iExon = 0; iExon < psl->blockCount; iExon++)
        gene->exonFrames[iExon] = -1;
    }

/* traverse psl in postive target order */
if (psl->strand[1] == '-')
    {
    startIdx = psl->blockCount-1;
    stopIdx = -1;
    idxIncr = -1;
    }
else
    {
    startIdx = 0;
    stopIdx = psl->blockCount;
    idxIncr = 1;
    }

iExon = -1;  /* indicate none have been added */
for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr)
    {
    int tStart = psl->tStarts[iBlk];
    int tEnd = tStart + psl->blockSizes[iBlk];
    if (psl->strand[1] == '-')
        reverseIntRange(&tStart, &tEnd, psl->tSize);
    if (!shouldMergeBlocks(gene, iExon, tStart, options,
                           cdsMergeSize, utrMergeSize))
        {
        /* new exon */
        iExon++;
        gene->exonStarts[iExon] = tStart;
	}
    gene->exonEnds[iExon] = tEnd;
    /* Set the frame. We have to deal with multiple blocks being merged.  For
     * positive strand, the first block frame is used, for negative strand,
     * the last is used. */
    if ((gene->optFields & genePredExonFramesFld) && (cds != NULL)
        && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0))
            || (psl->strand[0] == '-')))
	{
        int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds);
        if (fr >= 0)
	    gene->exonFrames[iExon] = fr;
	}
    }
gene->exonCount = iExon+1;
assert(gene->exonCount <= psl->blockCount);
}

struct genePred *genePredFromPsl3(struct psl *psl,  struct genbankCds* cds, 
                                  unsigned optFields, unsigned options,
                                  int cdsMergeSize, int utrMergeSize)
/* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
 * specification string to genomic coordinates. Small genomic inserts are
 * merged based on the mergeSize parameters.  Gaps no larger than the
 * specified merge sizes result in the adjacent blocks being merged into a
 * single exon.  Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize.  If
 * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
 * if a multiple of three.  A negative merge sizes disables merging of blocks.
 * This differs from specifying zero in that adjacent blocks will not be
 * merged. The optfields field is a set from genePredFields, indicated what
 * fields to create.  Zero-length CDS, or null cds, creates without CDS
 * annotation.  If cds is null, it will set status fields to cdsNone.  */
{
struct genePred *gene;
#if 0
/* FIXME; the browser calls this on a protein psl when rendering the amino
 * acids of a protein alignment track.  Some of this code doesn't really make
 * sense for proteins and I really don't see how this produces the right
 * results, however the check is disabled until this can be investigated
 * because it *seems* to work.  markd 2006-05-22 */
if (pslIsProtein(psl))
    errAbort("can't convert protein psls to genePreds");
#endif

AllocVar(gene);
gene->name = cloneString(psl->qName);
gene->chrom = cloneString(psl->tName);
gene->txStart = psl->tStart;
gene->txEnd = psl->tEnd;
gene->optFields = optFields;

/* get strand in genome that the positive version mRNA aligns to */
if (psl->strand[1] == '\0')
    {
    /* assumed pos target strand, so neg query would be pos target */
    gene->strand[0] = psl->strand[0];
    }
else 
    {
    /* query and target strand are different; will be neg when query pos */
    gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+');
    }

genePredAddGenbankCds(psl, cds, gene);
pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize);
return gene;
}

struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
                                  struct genbankCds* cds, int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options
 * and insertMergeSize set for CDS and UTR.
 */
{
return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize);
}

struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
                                 int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered.  See that function's
 * documentation for details. This calls genePredFromPsl3 with no options.
 */
{
struct genbankCds cds;
ZeroVar(&cds);
cds.start = cdsStart;
cds.end = cdsEnd;
return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize);
}

char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options,
                           int chromIndexLen)
/* Get SQL required to create a genePred table. optFields is a bit set
 * consisting of the genePredFields values. Options are a bit set of
 * genePredCreateOpts. Returned string should be freed.  This will create all
 * optional fields that preceed the highest optFields column.  chromIndexLen
 * is now ignored.. */
{
/* the >= is used so that we create preceeding fields. */
char sqlCmd[1024];

safef(sqlCmd, sizeof(sqlCmd), createSql, table,
      ((options & genePredWithBin) ? binFieldSql : noBinIndexSql));
if (optFields >= genePredScoreFld)
    safecat(sqlCmd, sizeof(sqlCmd), scoreFieldSql);
if (optFields >= genePredName2Fld)
    safecat(sqlCmd, sizeof(sqlCmd), name2FieldSql);
if (optFields >= genePredCdsStatFld)
    safecat(sqlCmd, sizeof(sqlCmd), cdsStatFieldSql);
if (optFields >= genePredExonFramesFld)
    safecat(sqlCmd, sizeof(sqlCmd), exonFramesFieldSql);
safecat(sqlCmd, sizeof(sqlCmd), ")");
return cloneString(sqlCmd);
}

// FIXME: this really doesn't belong in this module
struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap)
{
/* read all genes from a table find the gene with the biggest overlap. 
   Cache the list of genes to so we only read it once */

char query[256];
struct genePred *gene;
struct sqlConnection *conn;
struct sqlResult *sr;
char **row;
struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL;
int overlap = 0 , bestOverlap = 0, i;
struct psl *psl;
int *eFrames;


if (*list == NULL)
    {
    printf("Loading Predictions from %s\n",table);
    AllocVar(*list);
    conn = hAllocConn(db);
    AllocVar(gene);
    safef(query, sizeof(query), "select * from %s", table);
    sr = sqlGetResult(conn, query);
    while ((row = sqlNextRow(sr)) != NULL)
        {
        if (!sameString(table,"all_mrna"))
            {
            el = genePredLoad(row);
            }
        else
            {
            psl = pslLoad(row);
            el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize);
            }
        slAddHead(list, el);
        }
    slReverse(list);
    sqlFreeResult(&sr);
    hFreeConn(&conn);
    }
for (el = *list; el != NULL; el = el->next)
    {
    if (chrom != NULL && el->chrom != NULL)
        {
        overlap = 0;
        if ( sameString(chrom, el->chrom))
            {
            for (i = 0 ; i<(el->exonCount); i++)
                {
                overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ;
                }
            if (overlap > 20 && sameString(name, el->name))
                {
                bestMatch = el;
                bestOverlap = overlap;
                *retOverlap = bestOverlap;
                }
            if (overlap > bestOverlap)
                {
                bestMatch = el;
                bestOverlap = overlap;
                *retOverlap = bestOverlap;
                }
            }
        }
    }
if (bestMatch != NULL)
    {
    /* Allocate genePred and fill in values. */
    AllocVar(gp);
    gp->name = cloneString(bestMatch->name);
    gp->chrom = cloneString(bestMatch->chrom);
    gp->strand[1] = bestMatch->strand[1];
    gp->strand[0] = bestMatch->strand[0];
    gp->txStart = bestMatch->txStart;
    gp->txEnd = bestMatch->txEnd;
    gp->cdsStart = bestMatch->cdsStart;
    gp->cdsEnd = bestMatch->cdsEnd;
    gp->exonCount = bestMatch->exonCount;
    AllocArray(gp->exonStarts, bestMatch->exonCount);
    AllocArray(gp->exonEnds, bestMatch->exonCount);
    for (i=0; i<bestMatch->exonCount; ++i)
        {
        gp->exonStarts[i] = bestMatch->exonStarts[i] ;
        gp->exonEnds[i] = bestMatch->exonEnds[i] ;
        }
    gp->optFields = bestMatch->optFields;
    gp->score = bestMatch->score;

    if (bestMatch->optFields & genePredName2Fld)
        gp->name2 = cloneString(bestMatch->name2);
    else
        gp->name2 = NULL;
    if (bestMatch->optFields & genePredCdsStatFld)
        {
        gp->cdsStartStat = bestMatch->cdsStartStat;
        gp->cdsEndStat = bestMatch->cdsEndStat;
        }
    if (bestMatch->optFields & genePredExonFramesFld)
        {
        gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount);
        for (i = 0; i < bestMatch->exonCount; i++)
            gp->exonFrames[i] = bestMatch->exonFrames[i];
        }
    eFrames = gp->exonFrames;
    }

return gp;
}
int genePredBases(struct genePred *gp)
/* count coding and utr bases in a gene prediction */
{
int count = 0, i;

if (gp == NULL) return 0;

for (i=0; i<gp->exonCount; i++)
    {
    count += gp->exonEnds[i] - gp->exonStarts[i] ;
    }
return count;
}

int genePredCodingBases(struct genePred *gp)
/* Count up the number of coding bases in gene prediction. */
{
int i, exonCount = gp->exonCount;
int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd;
int baseCount = 0;
for (i=0; i<exonCount; ++i)
    {
    baseCount += positiveRangeIntersection(cdsStart,cdsEnd,
    	gp->exonStarts[i], gp->exonEnds[i]);
    }
return baseCount;
}

boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr)
/* Get the CDS range in an exon.  If there is no CDS, return FALSE and then
 * set start == end */
{
assert((iExon >= 0) && (iExon < gp->exonCount));
int start = gp->exonStarts[iExon];
int end = gp->exonEnds[iExon];

if (start < gp->cdsStart)
    start = gp->cdsStart;
if (end > gp->cdsEnd)
    end = gp->cdsEnd;
if (startPtr != NULL)
    *startPtr = start;
if (endPtr != NULL)
    *endPtr = end;
return (start < end);
}

/* state for genePredCheck */
static int gpErrorCnt = 0;  /* error count for gpError */
static FILE *gpErrFh = NULL;  /* error output for gpError */

static void gpError(char *format, ...)
/* print and count an error */
{
va_list args;
fprintf(gpErrFh, "Error: ");
va_start(args, format);
vfprintf(gpErrFh, format, args);
va_end(args);
fputc('\n', gpErrFh);
gpErrorCnt++;
}

static void checkExon(char *desc, struct genePred* gp, int iExon)
/* check one exon of a genePred */
{
unsigned exonStart = gp->exonStarts[iExon];
unsigned exonEnd = gp->exonEnds[iExon];
if (exonStart >= exonEnd)
    gpError("%s: %s exon %u start %u >= end %u", desc, gp->name,
            iExon, exonStart, exonEnd);
if (exonStart < gp->txStart)
    gpError("%s: %s exon %u start %u < txStart %u", desc, gp->name,
            iExon, exonStart, gp->txStart);
if (exonEnd > gp->txEnd)
    gpError("%s: %s exon %u end %u > txEnd %u", desc, gp->name,
            iExon, exonEnd, gp->txEnd);
if (iExon > 0)
    {
    /* other than first exon */
    unsigned prevExonEnd = gp->exonEnds[iExon-1];
    if (exonStart < prevExonEnd)
        gpError("%s: %s exon %u overlaps previous exon", desc,
                gp->name, iExon);
    }

if (gp->optFields & genePredExonFramesFld)
    {
    int frame = gp->exonFrames[iExon];
    if ((frame < -1) || (frame > 2))
        gpError("%s: %s invalid exonFrame: %d", desc, gp->name, frame);
    if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd))
        {
        if (frame == -1)
            gpError("%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
        }
    else
        {
        if (frame != -1)
            gpError("%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
        }
    }
}

static void checkCdsBounds(char *desc, FILE* out, struct genePred* gp)
/* check cdsStart/cdsEnd */
{
int iExon;
boolean foundCdsStart = FALSE, foundCdsEnd = FALSE;
if (gp->cdsStart > gp->cdsEnd)
    gpError("%s: %s cdsStart %u > cdsEnd %u", desc, gp->name,
            gp->cdsStart, gp->cdsEnd);
if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd))
    gpError("%s: %s cdsStart %u not in tx bounds %u-%u", desc,
            gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd))
    gpError("%s: %s cdsEnd %u not in tx bounds %u-%u", desc, 
            gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
/*  is cdsStart/cdsEnd in an exon?  */
for (iExon = 0; iExon < gp->exonCount; iExon++)
    {
    if ((gp->cdsStart >= gp->exonStarts[iExon])
        && (gp->cdsStart < gp->exonEnds[iExon]))
        foundCdsStart = TRUE;
    if ((gp->cdsEnd > gp->exonStarts[iExon])
        && (gp->cdsEnd <= gp->exonEnds[iExon]))
        foundCdsEnd = TRUE;
    }
if (!foundCdsStart)
    gpError("%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
if (!foundCdsEnd)
    gpError("%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
}

int genePredCheck(char *desc, FILE* out, int chromSize, 
                  struct genePred* gp)
/* Validate a genePred for consistency.  desc is printed the error messages
 * to file out (open /dev/null to discard).  chromSize should contain
 * size of chromosome, or 0 if chrom is not valid, or -1 to not check
 * chromosome bounds. Returns count of errors. */
{
int iExon;
gpErrorCnt = 0;
gpErrFh = out;
if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-")))
    gpError("%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);

/* check chrom */
if (chromSize == 0)
    gpError("%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
else if (chromSize > 0)
    {
    if (gp->txEnd > chromSize)
        gpError("%s: %s txEnd %u >= chromSize %u", desc, gp->name,
                gp->txEnd, chromSize);
    }

/* check internal consistency */
if (gp->txStart >= gp->txEnd)
    gpError("%s: %s txStart %u >= txEnd %u", desc, gp->name,
            gp->txStart, gp->txEnd);

/* no CDS is indicated by cdsStart == cdsEnd */
if (gp->cdsStart != gp->cdsEnd)
    checkCdsBounds(desc, out, gp);

/* make sure first/last exons match tx range */
if (gp->txStart != gp->exonStarts[0])
    gpError("%s: %s first exon start %u doesn't match txStart %u", desc, 
            gp->name, gp->exonStarts[0], gp->txStart);
if (gp->txEnd != gp->exonEnds[gp->exonCount-1])
    gpError("%s: %s last exon end %u doesn't match txEnd %u", desc, 
            gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);


/* check each exon */
for (iExon = 0; iExon < gp->exonCount; iExon++)
    checkExon(desc, gp, iExon);

return gpErrorCnt;
}

boolean genePredNmdTarget(struct genePred *gp) 
/* Return TRUE if cds end is more than 50bp upstream of
   last intron. */
{
int gpSize = 0;
int i = 0;
int startDist = 0, endDist = 0;
int blockSize = 0;
if(gp->exonCount < 2)
    return FALSE;
for(i = 0; i < gp->exonCount; i++)
    {
    int blockStart = gp->exonStarts[i];
    int blockEnd = gp->exonEnds[i];
    blockSize = blockEnd - blockStart;
    if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd)
	startDist += blockSize - (blockEnd - gp->cdsStart);
    else if(blockEnd <= gp->cdsStart)
	startDist += blockSize;
    if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd)
	endDist += blockSize - (blockEnd - gp->cdsEnd);
    else if(blockEnd <= gp->cdsEnd)
	endDist += blockSize;
    gpSize += blockSize;
    }
/*  xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx  */                                           
if(sameString(gp->strand, "+"))
    {
    blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1];
    return endDist < (gpSize - blockSize - 50); 
    }
else if(sameString(gp->strand, "-"))
    {
    blockSize = gp->exonEnds[0] - gp->exonStarts[0];
    return startDist > (gp->exonEnds[1] + 50);
    }
else 
    errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand);
return FALSE;
}

void genePredAddExonFrames(struct genePred *gp)
/* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
 * to be contiguous. */
{
int iExon, start, end, iBase = 0;
int iStart, iEnd, iDir;

/* initial array to all -1, for no frame */
AllocArray(gp->exonFrames, gp->exonCount);
gp->optFields |= genePredExonFramesFld;
for (iExon = 0; iExon < gp->exonCount; iExon++)
    gp->exonFrames[iExon] = -1;

if (sameString(gp->strand, "+"))
    {
    iStart = 0;
    iEnd = gp->exonCount;
    iDir = 1;
    }
else
    {
    iStart = gp->exonCount-1;
    iEnd = -1;
    iDir = -1;
    }
for (iExon = iStart; iExon != iEnd; iExon += iDir)
    {
    if (genePredCdsExon(gp, iExon, &start, &end))
        {
        gp->exonFrames[iExon] = iBase % 3;
        iBase += (end - start);
        }
    }
}

void genePredRc(struct genePred *gp, int chromSize)
/* Reverse complement a genePred (project it to the opposite strand).  Useful
 * when doing analysis that is simplified by having things on the same strand.
 */
{
int iExon;
gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+';
reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize);
reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize);
for (iExon = 0; iExon < gp->exonCount; iExon++)
    reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize);
reverseUnsigned(gp->exonStarts, gp->exonCount);
reverseUnsigned(gp->exonEnds, gp->exonCount);
if (gp->optFields & genePredCdsStatFld)
    {
    enum cdsStatus hold =  gp->cdsStartStat;
    gp->cdsStartStat = gp->cdsEndStat;
    gp->cdsEndStat = hold;
    }
if (gp->optFields & genePredExonFramesFld)
    reverseInts(gp->exonFrames, gp->exonCount);
}

int genePredCdsSize(struct genePred *gp)
/* compute the number of bases of CDS */
{
int iExon, start, end, cdsBases = 0;

for (iExon = 0; iExon < gp->exonCount; iExon++)
    {
    if (genePredCdsExon(gp, iExon, &start, &end))
        cdsBases += (end - start);
    }
return cdsBases;
}

struct genePred *genePredNew(char *name, char *chrom, char strand,
                             unsigned txStart, unsigned txEnd,
                             unsigned cdsStart, unsigned cdsEnd,
                             unsigned optFields, unsigned exonSpace)
/* create a new gene with space for the specified number of exons allocated.
 * genePredGrow maybe used to expand this space if needed. */
{
struct genePred *gp;
AllocVar(gp);
assert(exonSpace > 0);

gp->name = cloneString(name);
gp->chrom  = cloneString(chrom);
gp->strand[0] = strand;
gp->txStart = txStart;
gp->txEnd = txEnd;
gp->cdsStart  = cdsStart;
gp->cdsEnd = cdsEnd;
gp->optFields = optFields;
AllocArray(gp->exonStarts, exonSpace);
AllocArray(gp->exonEnds, exonSpace);
if (gp->optFields & genePredExonFramesFld)
    AllocArray(gp->exonFrames, exonSpace);
return gp;
}

void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr)
/* Increase memory allocated to a psl to hold more exons.  exonSpacePtr
 * should point the the current maximum number of exons and will be
 * updated to with the new amount of space. */
{
int exonSpace = *exonSpacePtr;
int newSpace = 2 * exonSpace;
ExpandArray(gp->exonStarts, exonSpace, newSpace);
ExpandArray(gp->exonEnds, exonSpace, newSpace);
if (gp->optFields & genePredExonFramesFld)
    ExpandArray(gp->exonFrames, exonSpace, newSpace);
*exonSpacePtr = newSpace;
}

struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly)
/* Convert genePred into a range tree. */
{
struct rbTree *rangeTree = rangeTreeNew();
int i;
for (i=0; i<gp->exonCount; ++i)
    {
    int start = gp->exonStarts[i];
    int end = gp->exonEnds[i];
    if (cdsOnly)
        {
	start = max(start, gp->cdsStart);
	end = min(end, gp->cdsEnd);
	if (start >= end)
	    continue;
	}
    rangeTreeAdd(rangeTree, start, end);
    }
return rangeTree;
}

void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, 
	char *type, int id, int minSize)
/* Write out part of gp as bed12. */
{
/* Figure out # of blocks and min/max of area inside start/end */
int blockCount = 0;
int newStart = gp->txEnd, newEnd = gp->txStart;
int size = 0;
int i;
for (i=0; i<gp->exonCount; ++i)
    {
    int exonStart = gp->exonStarts[i];
    int exonEnd = gp->exonEnds[i];
    exonStart = max(start, exonStart);
    exonEnd = min(end, exonEnd);
    if (exonStart < exonEnd)
        {
	++blockCount;
	newStart = min(exonStart, newStart);
	newEnd = max(exonEnd, newEnd);
	size += exonEnd - exonStart;
	}
    }

/* Output first 10 fields of bed12. */
if (size > minSize)
    {
    fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd);
    fprintf(f, "%s_%d_%s\t", type, id, gp->name);
    fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount);

    /* Output blockSizes field */
    for (i=0; i<gp->exonCount; ++i)
	{
	int exonStart = gp->exonStarts[i];
	int exonEnd = gp->exonEnds[i];
	exonStart = max(start, exonStart);
	exonEnd = min(end, exonEnd);
	if (exonStart < exonEnd)
	    fprintf(f, "%d,", exonEnd - exonStart);
	}
    fprintf(f, "\t");

    /* Output chromStarts field */
    for (i=0; i<gp->exonCount; ++i)
	{
	int exonStart = gp->exonStarts[i];
	int exonEnd = gp->exonEnds[i];
	exonStart = max(start, exonStart);
	exonEnd = min(end, exonEnd);
	if (exonStart < exonEnd)
	    fprintf(f, "%d,", exonStart - newStart);
	}
    fprintf(f, "\n");
    }
}

boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
{
// map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon.
// Returns true if we find the codon in given gene predition; chromStart and end are set to appropriate three base region.

int pos = -1;
int i;
int offset = -1;  // current 1-based offset in bases (not codons)
if(gp->strand[0] == '+')
    {
    for(i = 0; i < gp->exonCount; i++)
        {
        if(gp->exonEnds[i] > gp->cdsStart && gp->exonStarts[i] < gp->cdsEnd)
            {
            int start, end;
            if(offset == -1 && gp->cdsStart <= gp->exonEnds[i])
                {
                // start counting
                start = gp->cdsStart;
                offset = 1;
                }
            else
                start = gp->exonStarts[i];
            if(gp->cdsEnd < gp->exonEnds[i])
                end = gp->cdsEnd;
            else
                end = gp->exonEnds[i];
            int next = offset + end - start;
            if(next > (num * 3 - 2))
                {
                pos = start + (((num - 1) * 3 + 1) - offset);
                break;
                }
            else
                offset = next;
            }
        }
    }
else
    {
    for(i = gp->exonCount - 1; i >= 0; i--)
        {
        if(gp->exonStarts[i] < gp->cdsEnd && gp->exonEnds[i] > gp->cdsStart)
            {
            int start, end; // start here is really the RHS, and end is the LHS
            if(offset == -1 && gp->cdsEnd >= gp->exonStarts[i])
                {
                // start counting
                start = gp->cdsEnd;
                offset = 1;
                }
            else
                start = gp->exonEnds[i];
            if(gp->cdsStart > gp->exonStarts[i])
                end = gp->cdsStart;
            else
                end = gp->exonStarts[i];
            int next = offset + start - end;
            if(next > num * 3)
                {
                pos = start - (num*3 - offset) - 1;
                break;
                }
            else
                offset = next;
            }
        }
    }
if(pos == -1)
    return FALSE;
else
    {
    *chromStart = pos;
    *chromEnd = pos + 3;
    return TRUE;
    }
}

boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
{
// map 1-based exon number to genomic coordinates.
// Returns true if we find the exon in given gene predition; start and end are set to appropriate region.

if(num == 0 || num > gp->exonCount)
    return FALSE;
else if(gp->strand[0] == '+')
    {
    *chromStart = gp->exonStarts[num - 1];
    *chromEnd = gp->exonEnds[num - 1];
    }
else
    {
    *chromStart = gp->exonStarts[gp->exonCount - num];
    *chromEnd = gp->exonEnds[gp->exonCount - num];
    }
return TRUE;
}
