/* ----------------------------------------------------------------------
 * matrix
 * Program searching for promoters using the Staden matrix
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <math.h>
#include <locale.h>

#include "genpak.h"
#include "gp_getopt.h"


/* Program name printed by the help screen, and the version string printed
 * by both the help screen and the -v option. */
#define PROGNAME "gp_matrix"
#define VERSION "1.0"

/* Default score threshold; the -T option overrides it. */
#define TRESHOLD -99
#define MAX_1 12    /* lengths of the four matrices */
#define MAX_10 19
#define MAX_35 25
#define MAX_S 20
/* Gap size that corresponds to entry 0 of the gap penalty table:
 * the table is read and printed with an offset of MINGAP10. */
#define MINGAP10 10

/* Number of rows of the +1, -10 and -35 matrices and number of gap penalty
 * entries actually used by the loops in this file; initialised to the
 * compile-time maxima above. */
int m1_size = MAX_1, m10_size = MAX_10, m35_size = MAX_35, mgap_size = MAX_S ; 

/* GC distribution correction table, filled in by GetGCDistro(). */
typedef struct {
	/* number of entries in every row of tbl */
	long leng ;
	/* one dynamically allocated row of corrections per nucleotide,
	 * in the order A, C, G, T */
	double *tbl[4] ;
} gcdistro_s ;

/* One best-scoring promoter found in a sequence; the promoters of all
 * sequences are chained into a singly linked list through `next`. */
typedef struct promoter_ss {
	char name[100] ;             /* name of the sequence it was found in */
	char sequ[150] ;             /* copy of the promoter region */
	long pos ;                   /* offset of the region within the sequence */
	int gap1 ;                   /* gap offset between the -10 and +1 matrices */
	int gap10 ;                  /* gap offset between the -35 and -10 matrices */
	double score ;               /* total score */
	double score1 ;              /* score of the +1 matrix */
	double score10 ;             /* score of the -10 matrix */
	double score35 ;             /* score of the -35 matrix */
	struct promoter_ss * next ;  /* next list element, or NULL */
} promoter_s ;

/* The four score tables. Every matrix row holds one value per nucleotide
 * in the order A, C, G, T. */
typedef struct {
	double m1[MAX_1][4] ;    /* +1 region */
	double m10[MAX_10][4] ;  /* -10 region */
	double m35[MAX_35][4] ;  /* -35 region */
	double ms[MAX_S] ;       /* gap penalties, entry 0 is gap size MINGAP10 */
} Matrix ;

/* All run-time settings, filled in by GetTheOptions(). */
typedef struct {
	/* files */
	FILE *in ;             /* sequences to evaluate */
	FILE *out ;            /* where the results are written */
	FILE *matrix ;         /* matrix definition */
	FILE *gcdistro_f ;     /* GC distribution (only with -D) */

	gcdistro_s gcdistro ;  /* correction table read from gcdistro_f */

	/* gap limits, stored as offsets (see the -X option) */
	int min10 ;
	int max10 ;
	int min1 ;
	int max1 ;

	/* switches */
	int gap ;              /* use gap penalties? */
	int experimental ;     /* assume experimental? */
	int showsequence ;     /* show promoter sequences? */
	int showmatrix ;       /* computate new matrix? */
	int shownames ;        /* show sequence names? */
	int usegcdistro ;      /* use a GC distribution? */
	int showpositions ;    /* show positions (-p) */

	double treshold ;      /* treshold value */
	double gccontents ;    /* assumed GC content in percent */
} options_s ;

/* --- reading input and options --- */
int GetMatrix(FILE *mt, Matrix *mtx) ;
int GetGCDistro(options_s opt, gcdistro_s *gcd ) ;
void GetTheOptions(options_s *option, int argc, char *argv[]) ;
void EraseAllPromoters(promoter_s *p) ;
/* Help() is called from GetTheOptions() before its definition further down;
 * declare it so that the call does not rely on an implicit declaration. */
void Help(void) ;

/* --- output --- */
int PrintAllPromoters(options_s opt, promoter_s *p) ;
int PrintMatrix(FILE *out, Matrix *matrix) ;
int PrintPromoter(options_s opt, promoter_s *prom) ;
void PrintStatistics(options_s o,promoter_s *p) ;

/* --- matrix arithmetic --- */
int ComputateMatrix(int N, Matrix* m) ;
int ScaleMatrix(Matrix* m, double gc) ;
int ZeroMatrix(Matrix *mtx) ;
char MaksValue(double i[4]) ;

/* --- scanning a sequence --- */
promoter_s *EvaluateSequence(options_s option, 
	sekw* s, Matrix* im, Matrix *om) ;

/*
 * -----------------------------------------------------------------------
 * Program entry point: reads the options, the matrix and (optionally) the
 * GC distribution, evaluates every input sequence, collects the promoters
 * that EvaluateSequence() returns in a linked list, and finally prints the
 * statistics, the promoters and - with -M - a newly computed matrix.
 * -----------------------------------------------------------------------
 */

int main(int argc, char *argv[]) {

	options_s option ;

	Matrix inmatrix, outmatrix ;
	sekw* inseq ;

	promoter_s *temp, *current, *promoter ;
	gcdistro_s *gcdistro ;

	int nseq = 0, licznik = 0 ;  /* sequences read, promoters collected */

	allwarnings = NULL ;
	promoter = NULL ;
	current = NULL ;
	progname = argv[0] ;

	/* Initializing matrices */

	ZeroMatrix(&inmatrix) ;
	ZeroMatrix(&outmatrix) ;

	setlocale(LC_ALL,"") ;

	GetTheOptions(&option,argc,argv) ;

	if(option.usegcdistro) GetGCDistro(option,&option.gcdistro) ;

	(void) GetMatrix(option.matrix,&inmatrix) ;
	(void) ScaleMatrix(&inmatrix,option.gccontents) ;

	gcdistro = &(option.gcdistro) ;

	/* if(debug) PrintMatrix(option->out, &inmatrix) ;*/


	/* Main program loop, creating a linked list of promoters */

	if(debug) gp_warn("Collecting maximal score promoters from sequences") ;
	/* NOTE(review): the sequences returned by gp_seq_read_fragment() are
	 * never released here - confirm whether genpak offers a matching
	 * free routine. */
	while( (inseq = gp_seq_read_fragment(option.in,0,0,0)) != NULL) { 

		nseq++ ;

		if( option.usegcdistro && inseq->leng > gcdistro->leng) {
			/* gcdistro->leng is a long, so it needs %li (was %i) */
			gp_warn("Skipping sequence %i: sequence longer then %li",
				nseq,gcdistro->leng) ;
			temp = NULL ;
		} else {
			temp = EvaluateSequence(option,inseq,&inmatrix,&outmatrix) ;
		}

		if(temp != NULL) {
			licznik++ ;
			if(promoter == NULL) {
				/* first promoter becomes the list head */
				promoter = temp ;
				current = promoter ;
			} else {
				/* append at the tail */
				current->next = temp ;
				current = current->next ;
				current->next = NULL ;
			}
		} 
	}

	if(debug) gp_warn("%i promoters collected",licznik) ;

	PrintStatistics(option,promoter) ;

	if(option.showsequence) PrintAllPromoters(option,promoter) ;


	if(option.showmatrix) {
		/* turn the nucleotide counts gathered in outmatrix into scores */
		ComputateMatrix(licznik,&outmatrix) ;
		PrintMatrix(option.out,&outmatrix) ;
	}

	EraseAllPromoters(promoter) ;
	if(html) gp_warn_print_all(option.out) ;
	if(html) fprintf(option.out,"</PRE>") ;
	
	fclose(option.out) ;
	fclose(option.in) ;
	return EXIT_SUCCESS ;
}


/* 
 * ---------------------------------------------------------------------- 
 * Getting all program options
 *
 * Sets every field of *option to its default, parses the command line
 * switches, opens the input and output streams (standard input / output
 * when no file names are given) and writes a summary of the parameters
 * to the output.
 * ---------------------------------------------------------------------- 
 */

void GetTheOptions(options_s *option, int argc, char *argv[]) {

	extern int optind ;
	extern char *optarg ;
	int errflg = 0 ;
	/* must be an int: a plain char cannot hold EOF reliably, and where
	 * char is unsigned the loop below would never terminate */
	int c ;
	int t ;
	int i ;

	char optstring[] = "pD:X:NG:HSMT:tgm:qvdh" ;

	/* Initiating options */
	option->in = NULL ;
	option->out = NULL ;
	option->matrix = NULL ;
	option->gcdistro_f = NULL ;
	option->gcdistro.leng = 0 ;
	for(i = 0 ; i < 4 ; i++) option->gcdistro.tbl[i] = NULL ;
	option->gap = TRUE ;
	option->experimental = FALSE ;
	option->showsequence = TRUE ;
	option->showmatrix = FALSE ;
	option->gccontents = 50 ;
	option->usegcdistro = FALSE ;
	option->treshold = TRESHOLD ;
	option->shownames = FALSE ;
	option->showpositions = FALSE ;
	option->min1= -3; 
	option->max1= 4 ; 
	option->min10= 0 ; 
	option->max10= 5 ; 

	while ((c = gp_getopt(argc, argv, optstring)) != EOF)
		switch(c) {
		case 'X':
			/* the limits are entered as gap sizes and stored as offsets;
			 * t keeps its sentinel value when no pattern matched */
			t = -100 ;
			if(sscanf(optarg,"min1= %i",&t) == 1) {
				option->min1= t-4 ;
				if(debug) gp_warn("option.min1= %i",option->min1) ;
			}
			if(sscanf(optarg,"max1= %i",&t) == 1) {
				option->max1= t-3 ;
				if(debug) gp_warn("option.max1= %i",option->max1) ;
			}
			if(sscanf(optarg,"min10= %i",&t) == 1) {
				option->min10= t-15 ;
				if(debug) gp_warn("option.min10= %i",option->min10) ;
			}
			if(sscanf(optarg,"max10= %i",&t) == 1) {
				option->max10= t-14 ;
				if(debug) gp_warn("option.max10= %i",option->max10) ;
			}
			if(t == -100) gp_warn("I don't understand \"%s\"",optarg) ;
		break ;
		case 'p':
			gp_warn("Showing position of +1 rather then the sequnece") ;
			option->showpositions = TRUE ;
			break ;
		case 'N':
			gp_warn("Will show sequence names") ;
			option->shownames = TRUE ;
			break ;
		case 'G':
			/* "%%" - a lone '%' at the end of a format string is invalid */
			if(sscanf(optarg,"%lf",&option->gccontents) != 1) 
				gp_error("Could not read the command line parameter GC%%") ;
			break ;
		case 'H':
			html = TRUE ;
			gp_warn("Will format in HTML") ;
			break ;
		case 'S':
			gp_warn("Will not show the promoter sequences") ;
			option->showsequence = FALSE ;
			break ;
		case 'M':
			gp_warn("Will computate a new matrix") ;
			option->showmatrix = TRUE ;
			break ;
		case 'T':
			if(sscanf(optarg,"%lf",&option->treshold) != 1) 
				gp_error("Could not read the command line parameter Treshold") ;
			break ;
		case 't':
			option->experimental = TRUE ;
			gp_warn("Assuming the sequences have experimental 5' ends") ;
			break ;
		case 'g':
			option->gap = FALSE ;
			gp_warn("Ignoring the gap penalties") ;
			break ;
		case 'D':
			gp_warn("Reading GC distribution from file %s",optarg) ;
			option->gcdistro_f = gp_file_open(optarg,"r") ;
			option->usegcdistro = TRUE ;
			break ;
		case 'm':
			if(debug) gp_warn("Reading matrix from file %s",optarg) ;
			option->matrix = gp_file_open(optarg,"r") ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			gp_warn("%s version %s\n",progname,VERSION) ;
			exit(0) ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn("Running in debug mode") ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			errflg++ ;
			break;
		}

	if(errflg) {
			gp_error("Type '%s -h' for help",progname) ;
	}


/* open the file pointer to read the sequences 
 * from: standard input or a file provided? */
	if(optind >= argc) option->in = stdin ;
	else option->in = gp_file_open(argv[optind],"r") ;

/* opening the file pointer to write the output: 
 * standard output or file provided? */
	optind++ ;

	if(optind >= argc) option->out = stdout ;
	else option->out = gp_file_open(argv[optind],"wb") ;

	if(option->matrix == NULL) {
		option->matrix = stdin ;
		fprintf(option->out,"# reading matrix from standard input\n") ;
	}

	/* back to the input file name for the summary below */
	optind-- ;

	if(html) fprintf(option->out,"<PRE>") ;
	if(optind < argc)
		fprintf(option->out,"# sequences read from file %s\n",argv[(optind)]) ;
	fprintf(option->out,"# Parameters:\n# gap - %s; assume experimental - %s\n",
		yesno[option->gap],yesno[option->experimental]) ;
	fprintf(option->out,"# Treshold set to %.2f\n",option->treshold) ;
	if(option->usegcdistro)
		fprintf(option->out,"# Adjusting matrix with a GC distribution\n") ;
	fprintf(option->out,"# GC content assumed to be %.2f\n",option->gccontents) ;
	/* convert the stored offsets back to the gap sizes the user entered */
	fprintf(option->out,"# Gap +1/-10: %i to %i, Gap -10/-35: %i to %i\n",
		option->min1+4, option->max1+3, option->min10+15, option->max10+14) ;
	if(html) fprintf(option->out,"</PRE>") ;

}



/* Prints result statistics: mean and spread of the total and per-matrix
 * scores and of the two gap sizes over all promoters in the list p.
 * With fewer than two promoters every spread is reported as 0, and with an
 * empty list the means are reported as 0 as well. */

void PrintStatistics(options_s o,promoter_s *p) {

	/* the *_m variables first hold sums, the *_se variables sums of squares */
	double score_m = 0.0, score_se = 0.0, 
	score1_m = 0.0, score1_se = 0.0, 
	score10_m = 0.0, score10_se = 0.0, 
	score35_m = 0.0, score35_se = 0.0, 
		gap1_m = 0.0, gap1_se = 0.0,
		gap10_m = 0.0, gap10_se = 0.0 ;
	int i = 0 ;

	promoter_s *t ;

	t = p ;

	while(t != NULL) {
		score_m += t->score ;
		score_se += t->score * t->score ;
		score1_m += t->score1 ;
		score1_se += t->score1 * t->score1 ;
		score10_m += t->score10 ;
		score10_se += t->score10 * t->score10 ;
		score35_m += t->score35 ;
		score35_se += t->score35 * t->score35 ;
		/* gaps are stored as offsets; +4 / +15 give the gap sizes */
		gap1_m += t->gap1 + 4 ;
		gap1_se += (t->gap1 +4 ) * (t->gap1 +4 ) ;
		gap10_m += t->gap10 + 15 ;
		gap10_se += (t->gap10 +15) * (t->gap10 + 15);
		i++ ;
		t = t->next ;
	}

	if(i>1) {
		score_se = pow(gp_variance_e(score_m,score_se,i),0.5) ;
		score1_se = pow(gp_variance_e(score1_m,score1_se,i),0.5) ;
		score10_se = pow(gp_variance_e(score10_m,score10_se,i),0.5) ;
		score35_se = pow(gp_variance_e(score35_m,score35_se,i),0.5) ;
		gap1_se = pow(gp_variance_e(gap1_m,gap1_se,i),0.5) ;
		gap10_se = pow(gp_variance_e(gap10_m,gap10_se,i),0.5) ;
	} else {
		/* no spread can be estimated; reset ALL accumulators so that no
		 * raw sum of squares is printed as an SE */
		score_se = 0.0 ; score1_se = 0.0 ; score10_se = 0.0 ;
		score35_se = 0.0 ; gap1_se = 0.0 ; gap10_se = 0.0 ;
	}

	/* the sums are all 0 for an empty list; skip the division by zero */
	if(i > 0) {
		score_m /= i ;
		score1_m /= i ;
		score10_m /= i ;
		score35_m /= i ;
		gap1_m /= i ;
		gap10_m /= i ;
	}

	if(html) {
		fprintf(o.out,"<HR><BR><H2>Results statistics:</H2>") ;
		fprintf(o.out,"Number of sequences evaluated: %i<BR>\n",i) ;
		fprintf(o.out,"<TABLE BORDER= 1>\n") ;
		fprintf(o.out,"<TR><TH>variable</TH><TH>mean</TH><TH>SE</TH></TR>\n") ;
		fprintf(o.out,"<TR><TD>score</TD><TD>%2.2f</TD><TD>%2.2f</TD></TR>\n",
			score_m, score_se) ;
		fprintf(o.out,"<TR><TD>gap +1/-10</TD><TD>%2.2f</TD><TD>%2.2f</TD></TR>\n",
			gap1_m, gap1_se) ;
		fprintf(o.out,"<TR><TD>gap -10/-35</TD><TD>%2.2f</TD><TD>%2.2f</TD></TR>\n",
			gap10_m, gap10_se) ;
		fprintf(o.out,"</TABLE>\n") ;
	} else {
		fprintf(o.out,"# Result statistics: \n") ;
		fprintf(o.out,"#        total |  +1   | -10   | -35  \n") ;
		fprintf(o.out,"# -------------+-------+-------+------- \n") ;
		fprintf(o.out,"# score  %2.2f | %2.2f | %2.2f | %2.2f \n",
			score_m, score1_m, score10_m, score35_m ) ;
		fprintf(o.out,"# SE:    %2.2f | %2.2f | %2.2f | %2.2f\n",
			score_se, score1_se, score10_se, score35_se ) ;
		fprintf(o.out,"# Mean gap sizes: \n") ;
		fprintf(o.out,"# gap +1/ -10: %2.2f +- %2.2f\n",gap1_m,gap1_se) ;
		fprintf(o.out,"# gap -10/ -35: %2.2f +- %2.2f\n",gap10_m,gap10_se) ;
		fprintf(o.out,"# Total promoters evaluated: %i: \n",i) ;
	}
		

}


/* Release every element of the promoter linked list that starts at p.
 * Passing NULL is allowed and does nothing. */
void EraseAllPromoters(promoter_s *p) {
	promoter_s *following ;

	for( ; p != NULL ; p = following) {
		/* remember the successor before the element is gone */
		following = p->next ;
		free(p) ;
	}
}


/* Print every promoter of the list p through PrintPromoter(). In HTML mode
 * the promoters are wrapped in a table whose header row is written first.
 * Always returns EXIT_SUCCESS. */
int PrintAllPromoters(options_s opt, promoter_s *p) {

	promoter_s *node ;
	FILE *out = opt.out ;

	if(html) {
		/* table header */
		fputs("<HR><BR><H2>Promoter sequences</H2>\n",out) ;
		fputs("<TABLE BORDER= 1><TR><TH BGCOLOR= \"00FF00\">\n",out) ;
		fputs("score</TH><TH> </TH>",out) ;
		fputs("<TH BGCOLOR= \"00FF00\">-35</TH><TH> </TH>",out) ;
		fputs("<TH BGCOLOR= \"00FF00\">-10</TH>",out) ;
		fputs("<TH> </TH><TH BGCOLOR= \"00FF00\">+1</TH>",out) ;
		fputs("<TH> </TH>",out) ;
		if(opt.shownames) {
			fputs("<TH BGCOLOR= \"00FF00\">name</TH>",out) ;
		}
		fputs("</TR>",out) ;
	}

	for(node = p ; node != NULL ; node = node->next) {
		PrintPromoter(opt,node) ;
	}

	if(html) {
		fputs("</TABLE>\n",out) ;
	}

	return EXIT_SUCCESS ;

}


/* =======================================================================
 * Here we take a sequence s, and run it against the input matrix im.
 * Every start position and every allowed combination of the two gaps is
 * scored; the best scoring combination is returned as a newly allocated
 * promoter_s (the caller owns it). NULL is returned when the sequence is
 * too short, when no window could be scored, or when the best score is
 * below opt.treshold.
 * opt.gap is TRUE if we use gap penalty, and FALSE if not.
 * In the om matrix structure the nucleotide and gap counts of the accepted
 * promoter are accumulated.
 * =======================================================================   */

/* Map a nucleotide character to its matrix column (A=0, C=1, G=2, T=3,
 * upper or lower case). Returns -1 for any other character. */
static int NucleotideColumn(int ch) {
	switch(ch) {
		case 'A': case 'a': return 0 ;
		case 'C': case 'c': return 1 ;
		case 'G': case 'g': return 2 ;
		case 'T': case 't': return 3 ;
		default: return -1 ;
	}
}

/* Score the `size` bases of s that start at `start` against matrix m.
 * Returns 1 and stores the sum in *score; returns 0 (leaving *score
 * untouched) when the window leaves the sequence or holds a character
 * that is not A, C, G or T. */
static int ScoreWindow(sekw *s, long start, int size, double (*m)[4],
	options_s *opt, double *score) {

	double sum = 0.0 ;
	int k, col ;

	if(start < 0 || start + size > s->leng) return 0 ;

	for(k = 0 ; k < size ; k++) {
		col = NucleotideColumn(s->sequ[start + k]) ;
		if(col < 0) return 0 ;
		sum += m[k][col] ;
		/* NOTE(review): the correction is looked up by the position
		 * within the matrix, not by the position within the sequence,
		 * exactly as before - confirm that this is intended. */
		if(opt->usegcdistro) sum += opt->gcdistro.tbl[col][k] ;
	}

	*score = sum ;
	return 1 ;
}


promoter_s * EvaluateSequence(options_s opt, sekw* s, Matrix* im, Matrix *om) {

	promoter_s *res ;   /* best combination found so far */
	long start, off10, off1, blocksize, ncopy, i ;
	int gap1, gap10, gapidx ;
	int found = 0 ;     /* has any window been scored at all? */
	double score, score1, score10, score35 ;

	/* checking whether sequence is longer then a minimal promoter region*/
	if(s->leng < (m1_size + opt.min1 + m10_size + opt.min10 + m35_size )) {
		gp_warn("Sequence %s too short to be evaluated",s->name) ;
		return NULL ;
	}

	res = calloc(1,sizeof(*res)) ;
	if(res == NULL) {
		gp_error("Out of memory") ;
		return NULL ;
	}
	res->score = -999 ;

	/* calculating the score for sequence 
	 * start - distance from sequence start
	 * gap1 - actual gap1 (between +1 and -10 matrices)
	 * gap10 - actual gap10 (between -10 and -35 matrices) */

	for(start = 0 ; start < s->leng ; start++) 
		for(gap1 = opt.min1 ; gap1 < opt.max1 ; gap1++)
			for(gap10 = opt.min10 ; gap10 < opt.max10 ; gap10++) {

				blocksize = gap1 + gap10 + m10_size + m35_size + m1_size ;

				if(opt.experimental && 
					((start + blocksize) != (s->leng-1))) continue ;

				if((start + blocksize) > s->leng) continue ;

				off10 = start + m35_size + gap10 ;   /* start of -10 */
				off1 = off10 + m10_size + gap1 ;     /* start of +1 */

				/* windows that cannot be scored are skipped */
				if(!ScoreWindow(s,start,m35_size,im->m35,&opt,&score35))
					continue ;
				if(!ScoreWindow(s,off10,m10_size,im->m10,&opt,&score10))
					continue ;
				if(!ScoreWindow(s,off1,m1_size,im->m1,&opt,&score1))
					continue ;

				/* setting the initial score, and maybe adding the gap
				 * score; gaps outside the penalty table cost nothing */
				score = 0.0 ;
				gapidx = gap10 + 15 - MINGAP10 ;
				if(opt.gap && gapidx >= 0 && gapidx < mgap_size)
					score = im->ms[gapidx] ;

				score += score10 ;
				score += score35 ;

				/* with experimental 5' ends the +1 score does not take
				 * part in the comparison; it is added at the very end */
				if(!opt.experimental) {
					score = score + score1 ;
				}

				if(score > res->score) {
					found = 1 ;
					res->score = score ;
					res->score1= score1 ;
					res->score10= score10 ;
					res->score35= score35 ;
					res->pos = start ;
					res->gap1= gap1 ;
					res->gap10= gap10 ; 
				}
		}	

	if(!found) {
		if(debug) gp_warn("%s: no promoter window could be scored",s->name) ;
		free(res) ;
		return NULL ;
	}

	if(opt.experimental) {
		res->score = res->score + res->score1 ;
	}

	/* copy the promoter region, never writing past res->sequ */
	blocksize = (m1_size + m10_size + m35_size + res->gap1 + res->gap10) ;
	ncopy = blocksize ;
	if(ncopy > (long) sizeof(res->sequ) - 1) ncopy = (long) sizeof(res->sequ) - 1 ;
	if(ncopy < 0) ncopy = 0 ;
	for(i = 0 ; i < ncopy ; i++ ) 
		res->sequ[i] = s->sequ[i + res->pos] ;
	res->sequ[ncopy] = '\0' ;

	/* bounded copy: longer names are truncated */
	snprintf(res->name,sizeof(res->name),"%s",s->name) ;

	
	/* All right. Up to now, we know where the promoter combination with the
	 * highest score is. */
	
	if(debug) gp_warn("%s Max = %f, pos_max = %li, gap1_max = %i, gap10_max = %i",
		s->name,res->score,res->pos,res->gap1,res->gap10) ;
	if(debug) gp_warn("Promoter: %s\n",res->sequ) ;

	if(res->score < opt.treshold) {
		free(res) ;
		return NULL ;
	}

	/* 
	 * There is yet one thing left to do. We must update the output matrix
	 * structure, om, to record the frequencies of single nucleotides. 
	 * We do it only in the case when the best score is at least the
	 * treshold set for the current run. The winning window was scored
	 * above, so every base in it maps to a valid column.
	 */

	off10 = res->pos + m35_size + res->gap10 ;
	off1 = off10 + m10_size + res->gap1 ;

	for(i = 0;i<m35_size;i++) 
		om->m35[i][NucleotideColumn(s->sequ[res->pos + i])] += 1 ;

	for(i = 0;i<m10_size;i++)
		om->m10[i][NucleotideColumn(s->sequ[off10 + i])] += 1 ;

	for(i = 0;i<m1_size;i++)
		om->m1[i][NucleotideColumn(s->sequ[off1 + i])] += 1 ;

	gapidx = res->gap10 + 15 - MINGAP10 ;
	if(gapidx >= 0 && gapidx < mgap_size)
		om->ms[gapidx]++ ;

	return res ;
}



/*
 * matrices read from input are assumed to be scaled to 50% GC contents,
 * that is, each nucleotide is expected to be found with equal probability.
 * This does not hold for organisms with other GC contents. Therefore,
 * apropriate scaling is needed.
 *
 * gc is the GC content in percent and must lie strictly between 0 and 100;
 * otherwise an error is raised and, should gp_error() return, the matrix
 * is left untouched and EXIT_FAILURE is returned.
 */

/* Subtract shift[j] from column j of each of the first nrows rows. */
static void ShiftMatrixRows(double (*rows)[4], int nrows, const double shift[4]) {
	int i, j ;

	for(i = 0 ; i < nrows ; i++)
		for(j = 0 ; j < 4 ; j++)
			rows[i][j] = rows[i][j] - shift[j] ;
}

int ScaleMatrix(Matrix* m, double gc) {

	double shift[4] ;   /* log-ratio of expected frequency to 0.25: A, C, G, T */

	/* 0 or 100 would give log(0), anything outside log(negative);
	 * the negated form also rejects NaN */
	if(!(gc > 0.0 && gc < 100.0)) {
		gp_error("GC contents must be greater than 0 and less than 100, got %f",gc) ;
		return(EXIT_FAILURE) ;
	}

	shift[0] = shift[3] = log((0.5*(100-gc)/100)/0.25) ;  /* A and T */
	shift[1] = shift[2] = log((0.5*gc/100)/0.25) ;        /* C and G */

	if(debug) gp_warn("Freqs: A: %f, C: %f, G: %f, T: %f",
		shift[0],shift[1],shift[2],shift[3]) ;
	
	ShiftMatrixRows(m->m35,m35_size,shift) ;
	ShiftMatrixRows(m->m10,m10_size,shift) ;
	ShiftMatrixRows(m->m1,m1_size,shift) ;

	return(EXIT_SUCCESS) ;
}




/*
 * We have set the occurences of nucleotides in the matrix, but we have 
 * to computate ln(freq) for each matrix cell, where freq = Number of
 * occurences of this base/ total number of sequences 
 * The expected frequencies are assumed to be 0.25.
 *
 * N is the number of promoters the counts were collected from. When N is
 * not positive there is nothing to divide by: a warning is issued, the
 * matrix is left as it is and EXIT_FAILURE is returned.
 */

/* Replace the counts in the first nrows rows by ln((count/N)/0.25);
 * empty cells get a pseudo count of 0.01 so that the logarithm exists. */
static void CountsToScores(double (*rows)[4], int nrows, int N) {
	int i, j ;

	for(i = 0 ; i < nrows ; i++)
		for(j = 0 ; j < 4 ; j++) {
			if(rows[i][j] < 0.1)
				rows[i][j] = 0.01 ;
			rows[i][j] = log((rows[i][j]/N)/0.25) ;
		}
}

int ComputateMatrix(int N, Matrix* m) {

	int i ;

	if(N <= 0) {
		gp_warn("Cannot computate a matrix from %i promoters",N) ;
		return(EXIT_FAILURE) ;
	}

	CountsToScores(m->m35,m35_size,N) ;
	CountsToScores(m->m10,m10_size,N) ;
	CountsToScores(m->m1,m1_size,N) ;

	/* gap penalties: ln of the relative frequency of each gap size */
	for(i = 0;i<mgap_size;i++) {
		if(m->ms[i]<0.01)
			m->ms[i] = 0.01 ;
		m->ms[i] = log(m->ms[i]/N) ;
	}
	return(EXIT_SUCCESS) ;
}



/* ----------------------------------------------------------------------
 * We have to load the matrix from a file.
 *
 * The file holds, in this order, m1_size rows for the +1 matrix, m10_size
 * rows for the -10 matrix and m35_size rows for the -35 matrix (four
 * numbers per row), followed by up to mgap_size lines "gapsize penalty".
 * Lines starting with '#', a blank or a tab are skipped everywhere. Empty
 * lines are skipped inside the matrices, but end the gap penalty section.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE should gp_error() return after a
 * malformed or truncated file has been reported.
 * ---------------------------------------------------------------------- */

/* Read the first nrows four-column rows of one matrix from mt; *line is
 * the running line counter used in the error messages. */
static int ReadMatrixRows(FILE *mt, double (*rows)[4], int nrows, int *line) {

	char bufor[BUFSIZ] ;
	double a,b,c,d ;
	int j ;

	for (j = 0;j<nrows;j++) {
		do {
			(*line)++ ;
			if(fgets(bufor,BUFSIZ,mt) == NULL) {
				gp_error("Unexpected end of matrix file at line %i",*line) ;
				return EXIT_FAILURE ;
			}
		} while (bufor[0] == '#' || bufor[0] == '\n' 
			|| bufor[0] == ' ' || bufor[0] == '\t') ; 
	
		if(sscanf(bufor," %lf %lf %lf %lf", &a, &b, &c, &d) != 4) {
			gp_error("Could not read matrix file line %i",*line);
			return EXIT_FAILURE ;
		}

		rows[j][0] = a ;
		rows[j][1] = b ;
		rows[j][2] = c ;
		rows[j][3] = d ;
	}

	return EXIT_SUCCESS ;
}

int GetMatrix(FILE *mt, Matrix *mtx) {

	int gapsize = 0, j = 0, line = 0 ;
	int gotline ;
	char bufor[BUFSIZ] ;
	double penalty ;

/* Read the first three matrices */

	if(debug) gp_warn("Reading matrix +1") ;
	if(ReadMatrixRows(mt,mtx->m1,m1_size,&line) != EXIT_SUCCESS)
		return EXIT_FAILURE ;
	
	if(debug) gp_warn("Reading matrix -10") ;
	if(ReadMatrixRows(mt,mtx->m10,m10_size,&line) != EXIT_SUCCESS)
		return EXIT_FAILURE ;
	
	if(debug) gp_warn("Reading matrix -35") ;
	if(ReadMatrixRows(mt,mtx->m35,m35_size,&line) != EXIT_SUCCESS)
		return EXIT_FAILURE ;

	if(debug) gp_warn("Reading gap penalties") ;
	
	for (j = 0;j<mgap_size;j++) {

		/* skip comment lines; stop looking as soon as the file ends so
		 * that a stale buffer is never examined or parsed */
		do {
			line++ ;
			gotline = (fgets(bufor,BUFSIZ,mt) != NULL) ;
		} while (gotline && 
			(bufor[0] == '#' || bufor[0] == ' ' || bufor[0] == '\t')) ; 

		/* end of file or an empty line closes the section early */
		if(!gotline || bufor[0] == '\n') {
			gp_warn("Not all gap penalties have been read") ;
			break ;
		}
	
		if(sscanf(bufor,"%i %lf", &gapsize, &penalty) != 2) {
			gp_error("Could not read matrix file line %i",line);
			return EXIT_FAILURE ;
		}

		/* the table only covers MINGAP10 .. MINGAP10 + mgap_size - 1 */
		if(gapsize < MINGAP10 || gapsize >= MINGAP10 + mgap_size) {
			gp_warn("Ignoring penalty for gap size %i at matrix file line %i",
				gapsize,line) ;
			continue ;
		}

		mtx->ms[(gapsize-MINGAP10)] = penalty ;
	}
	
	return EXIT_SUCCESS ;
	
}




/* Read the GC distribution file and set the correction table gcd.
 *
 * The first line that is neither empty nor a '#' comment holds the number
 * of entries; it is followed by that many lines with one GC percentage
 * each. For every entry the correction ln(0.25 / expected frequency) is
 * stored for C and G (rows 1 and 2) and for A and T (rows 0 and 3).
 * The rows of gcd->tbl are allocated here.
 * Returns EXIT_SUCCESS, or EXIT_FAILURE should gp_error() return. */

int GetGCDistro(options_s opt, gcdistro_s *gcd ) {

	char bufor[BUFSIZ+1] ;
	FILE *in ;
	long i,line = 1 ;   /* line holds the number of the line in bufor */
	int k ;
	double t ;

	in = opt.gcdistro_f ;

	if(fgets(bufor,BUFSIZ,in) == NULL) {
		gp_error("Premature end of the GC distribution file") ;
		return EXIT_FAILURE ;
	}

	/* skipping what's not important */
	while(bufor[0] == '#' || bufor[0] == '\n') {
		++line ;
		if(fgets(bufor,BUFSIZ,in) == NULL) {
			gp_error("Premature end of the specified GC distribution file") ;
			return EXIT_FAILURE ;
		}
	}

	/* reading span of distribution; it must be a positive count */
	if(sscanf(bufor," %li",&(gcd->leng)) != 1 || gcd->leng <= 0) {
		gp_error("Could not read line %li from GC distribution file",line) ;
		return EXIT_FAILURE ;
	}

	if(debug) gp_warn("Reading %li GC distribution parameters",gcd->leng) ;

	/* allocating space for the distribution of the specified span */
	for(k = 0 ; k < 4 ; k++) {
		gcd->tbl[k] = malloc(gcd->leng * sizeof(*(gcd->tbl[k]))) ;
		if(gcd->tbl[k] == NULL) {
			/* give back the rows obtained so far */
			while(k-- > 0) {
				free(gcd->tbl[k]) ;
				gcd->tbl[k] = NULL ;
			}
			gp_error("Out of memory") ;
			return EXIT_FAILURE ;
		}
	}

	/* reading the distribution itself */
	for(i = 0 ; i < gcd->leng ; i++) {

		/* skipping comments */

		do {
			++line ;
			if(fgets(bufor,BUFSIZ,in) == NULL) {
				gp_error("Premature end of GC distribution file at line %li",line);
				return EXIT_FAILURE ;
			}
		} while(bufor[0] == '#' || bufor[0] == '\n') ;

		if(sscanf(bufor," %lf",&t) != 1 ) {
			gp_error("Could not read line %li from GC distribution file",line) ;
			return EXIT_FAILURE ;
		}

		t = 0.5 * t / 100 ;  /* from % to per nucleotide frequency */

		/* correction table */
		/* NOTE(review): a value of 0 or 100 percent makes one of the two
		 * logarithms below infinite; the input is not checked for that. */

		gcd->tbl[1][i] = gcd->tbl[2][i] = log( 0.25 / t ) ;
		gcd->tbl[0][i] = gcd->tbl[3][i] = log( 0.25 / ( 0.5 - t) ) ;
			
		if(debug) gp_warn("Read t = %f : gc = %f , at = %f",
			t,gcd->tbl[1][i],gcd->tbl[0][i]) ;
	}

	return EXIT_SUCCESS ;

}




/* Set every used cell of the three score matrices and every used gap
 * penalty of *mtx to 0.0. Always returns EXIT_SUCCESS. */
int ZeroMatrix(Matrix* mtx) {

	int row, col ;

	for(row = 0 ; row < m1_size ; row++)
		for(col = 0 ; col < 4 ; col++)
			mtx->m1[row][col] = 0.0 ;

	for(row = 0 ; row < m10_size ; row++)
		for(col = 0 ; col < 4 ; col++)
			mtx->m10[row][col] = 0.0 ;

	for(row = 0 ; row < m35_size ; row++)
		for(col = 0 ; col < 4 ; col++)
			mtx->m35[row][col] = 0.0 ;

	row = 0 ;
	while(row < mgap_size) {
		mtx->ms[row] = 0.0 ;
		row++ ;
	}

	return EXIT_SUCCESS ;

}


/* Print out the matrix so it can be read into another run of matrix.
 * Every matrix row is followed by the letter of its highest scoring
 * nucleotide; the gap penalties are printed as "gapsize penalty".
 * In HTML mode the listing is wrapped in a <PRE> section.
 * Always returns 0. */

/* Print the first nrows rows of one matrix, each followed by the letter
 * MaksValue() picks for that same row. */
static void PrintMatrixRows(FILE *out, double (*rows)[4], int nrows,
	const char *sep) {

	int i ;

	for(i = 0;i<nrows;i++) {
		fprintf(out,"%4f%s%4f%s%4f%s%4f%s%c\n",
			rows[i][0],sep, 
			rows[i][1],sep, 
			rows[i][2],sep, 
			rows[i][3],sep,MaksValue(rows[i]) ) ;
	}
}

int PrintMatrix(FILE *out, Matrix *mtx) {


	int i ;
	char sep[15] = " " ;

	if(html) fprintf(out,"<BR><HR><BR><H2>Matrices:</H2><BR><PRE>\n") ;
	fprintf(out,"# + 1 region\n") ;
	PrintMatrixRows(out,mtx->m1,m1_size,sep) ;

	fprintf(out,"# ") ;
	fprintf(out, "-10 region\n") ;
	PrintMatrixRows(out,mtx->m10,m10_size,sep) ;

	fprintf(out,"# -35 region\n") ;
	/* the consensus letter is now taken from the -35 row itself; it used
	 * to come from the -10 matrix, which is also shorter than this one */
	PrintMatrixRows(out,mtx->m35,m35_size,sep) ;

	fprintf(out,"# gap penalties\n") ;

	for(i = 0;i<mgap_size;i++) {
		fprintf(out,"%i %7.4f\n", (i+MINGAP10), mtx->ms[i]) ;
	}

	if(html) fprintf(out,"</PRE>\n") ;

	return(0) ;

}


/* Return the letter ('A', 'C', 'G' or 'T') that belongs to the largest of
 * the four values in i; on ties the first one wins. */
char MaksValue(double i[4]) {

	static const char letters[4] = {'A','C','G','T' } ;
	int best = 0 ;   /* index of the largest value seen so far */
	int k ;

	for(k = 1 ; k < 4 ; k++)
		if(i[k] > i[best]) best = k ;

	return letters[best] ;
}



/* Print one promoter: its score(s), optionally three positions, then the
 * promoter sequence with markers written around three stretches of it, and
 * optionally the sequence name. In HTML mode the promoter becomes one table
 * row and the markers are highlighted cells; otherwise the markers are
 * square brackets. Always returns EXIT_SUCCESS. */
int PrintPromoter(options_s opt, promoter_s *prom) {

	const char *mark_on, *mark_off ;  /* spacers */
	FILE *out = opt.out ;
	int box10, box1 ;  /* offsets of the -10 and +1 matrices in sequ */
	int idx, len ;
	long pos ;

	if(html) {
		mark_on = "</TD><TD BGCOLOR= \"yellow\"><B>" ;
		mark_off = "</B></TD><TD>" ;
		fprintf(out,"<TR><TD>%2.2f</TD><TD>",prom->score) ;
	} else {
		mark_on = "[" ;
		mark_off = "]" ;
		fprintf(out,"# %2.2f ",prom->score) ;
		fprintf(out,"%2.2f ",prom->score1) ;
		fprintf(out,"%2.2f ",prom->score10) ;
		fprintf(out,"%2.2f ",prom->score35) ;
	}

	box10 = m35_size + prom->gap10 ;
	box1 = box10 + m10_size + prom->gap1 ;

	pos = prom->pos + box1 + 3 ;
	if(opt.showpositions) {
		fprintf(out,"%li %li %li ",
			prom->pos + 15, prom->pos + box10 + 10, pos) ;
	}

	len = (int) strlen(prom->sequ) ;
	for(idx = 0 ; idx < len ; idx++) {
		/* the six tests are independent: several may hit the same idx */
		if(idx == 14) fputs(mark_on,out) ;
		if(idx == 20) fputs(mark_off,out) ;
		if(idx == box10 + 10) fputs(mark_on,out) ;
		if(idx == box10 + 16) fputs(mark_off,out) ;
		if(idx == box1 + 2) fputs(mark_on,out) ;
		if(idx == box1 + 3) fputs(mark_off,out) ;
		fputc(prom->sequ[idx],out) ;
	}

	if(opt.shownames) {
		fprintf(out, html ? "</TD><TD>%s" : " %s", prom->name) ;
	}
	
	if(html) fputs("</TD></TR>",out) ;

	fputc('\n',out) ;


	return EXIT_SUCCESS ;
}



/* Print the help screen to standard output and leave the program with
 * exit status 0. The options listed here are meant to mirror the ones
 * handled by GetTheOptions(). */
void Help(void)
{
printf("\n");
printf("%s version %s - search for promoters using the Hertz matrix",PROGNAME,VERSION);
printf("\n");
printf("  Usage:\n");
printf("     %s [ options ] [ input sequence ] [ output file ]\n",progname);
printf("\n");
printf("  Biological options:\n");
printf("   -M         : computate a new matrix\n") ;
printf("   -m m_file  : take the matrix values from file m_file\n") ;
printf("   -G value   : adjust matrix to GC contents of [value] %%\n");
printf("   -g         : ignore gap penalties\n");
printf("   -t         : assume that the transcription start") ;
printf(" is experimentally defined\n");
printf("   -T value   : set the treshold value (default %i)\n",TRESHOLD);
printf("   -X limit = value : set gap limits. 'limit' can be 'min1', 'max1',\n") ;
printf("             'min10', 'max10'.\n") ;
printf("   -D file    : read GC distribution from a file\n\n") ;

printf("  Output options:\n");
printf("   -H         : format output in HTML\n") ;
printf("   -N         : sequence names will be shown\n") ;
printf("   -S         : do not show the sequences\n") ;
/* -p and -d are accepted by GetTheOptions() but used to be undocumented */
printf("   -p         : print the positions of the -35, -10 and +1 elements\n") ;
printf("   -d         : run in debug mode\n");
printf("   -q         : run in quiet mode\n");
printf("   -v         : print version information & exit\n");
printf("   -h         : print this help screen & exit\n");
printf("   -e         : suppress error messages\n\n");
exit(0);
}


			
