/* ----------------------------------------------------------------------
 * gp_findorf -- print out ORFs found in a DNA or RNA sequence
 * Copyright (C) 2000 January Weiner III
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307,
 * USA.
 ---------------------------------------------------------------------- */

#include <stdio.h>
#include <ctype.h>
#include <stdlib.h>
#include <time.h>
#include <string.h>

#include "genpak.h"
#include "gp_getopt.h"

#define VERSION "0.3"
#define PROGNAME "gp_findorf"

#define MINSEQLEN 300
#define MAXSEQLEN 0

char *progname ;

/*
 * Run-time options gathered by main() and handed to ExtractOrfs() by value.
 */
struct options_s {
	/* lower ORF length bound; main() divides the -m value (bases) by 3
	 * before ExtractOrfs() compares it with the translated length */
	long minlen ;
	/* upper ORF length bound, scaled like minlen; 0 disables the upper bound */
	long maxlen ;
	/* stream the ORF positions are written to */
	FILE* out ;
	/* codon table passed to gp_seq_dna_to_protein() */
	codont* table ;
	/* output line width, initialised to 70 in main();
	 * NOTE(review): not read by any active code in this file */
	int width ;
	/* value of the -o option ('s' or 'n'), 0 when not given;
	 * NOTE(review): not read by ExtractOrfs() in this file */
	char otype ;
} ;

/* Function prototypes */

codont* ReadCodonTable(FILE *in, codont* outtable) ;
int PrintCodonTable(FILE* out, codont *intable,int type) ;
int ExtractOrfs(sekw *inseq, struct options_s opts) ;


/*
 * Program entry point.
 *
 * Parses the command line, optionally overrides the standard codon table
 * with one read from a file (-c), and then either prints the codon table
 * (-p) or reads sequences one by one and reports their ORFs through
 * ExtractOrfs().
 *
 * Positional arguments are [ input file ] [ output file ]; standard input
 * and standard output are used for whichever one is missing.
 *
 * Returns EXIT_SUCCESS on normal completion and EXIT_FAILURE when the
 * input or output file cannot be opened.
 */

int main(int argc, char *argv[])
{
	
	FILE *in = NULL, *out = NULL, *codet = NULL ;
	sekw *inseq ;
	codont *tabela ;

	extern char* optarg ;
	extern int optind ;

	int onlyprint = FALSE, tableformat = 1 ; 

	int c, nseq = 0, norfs = 0;
	int errflg = 0 ;
	

	/* options structure holds all options -- to simplify argument passing */
	struct options_s options ;

	options.minlen = MINSEQLEN ;
	options.maxlen = MAXSEQLEN ;
	options.width = 70 ;
	options.otype = 0 ;

	/* Initializing structure holding all warnings */
	allwarnings = NULL ;

	/* Load the translation to three letter AA code */
	gp_codon_init_conversion() ;

	/* Load the standard code table */
	tabela = gp_codon_table_load_std() ;

	progname = argv[0] ;

	while ((c = getopt(argc, argv, "o:m:M:pc:vHhqd")) != EOF)
		switch(c) {
		case 'o':
			if(strchr("sn",optarg[0]) == NULL) 
				gp_error("Wrong option for -o") ;
			options.otype = optarg[0] ;
			break ;
		case 'M':
			if(sscanf(optarg,"%li",&options.maxlen) != 1)
				gp_error("Could not load parameter -M %s",optarg) ;
			break ;
		case 'm':
			if(sscanf(optarg,"%li",&options.minlen) != 1)
				gp_error("Could not load parameter -m %s",optarg) ;
			break ;
		case 'p':
			onlyprint = TRUE ;
			/* 'p' takes no argument in the option string, so optarg must
			 * not be used here. The optional format letter is the next
			 * command-line word; when "-p" ends its own word, optind
			 * already indexes that word. Anything other than u/n keeps
			 * the default format. */
			if(optind<argc) {
				if(argv[optind][0] == 'u') {
					tableformat = 0 ;
				} else if(argv[optind][0] == 'n') {
					tableformat = 1 ;
				}
			}
			break ;
		case 'c':
			if( (codet = gp_file_open(optarg,"r")) == NULL) {
				gp_error("Failed to open file %s for reading", optarg) ;
			} else {
				(void) ReadCodonTable(codet,tabela) ;
				gp_warn("Loaded optional code table from file %s",optarg) ;
			}

			break ;
		case 'H':
			html = TRUE ;
			break ;
		case 'q':
			quiet = TRUE ;
			break ;
		case 'v':
			fprintf(stderr,"%s version %s\n",progname,VERSION) ;
			exit(EXIT_SUCCESS) ;
			break ;
		case 'd':
			debug = TRUE ;
			gp_warn("Running in debug mode") ;
			break ;
		case 'h':
			Help() ;
			break ;
		default:
			errflg++ ;
			break;
		}

	if(errflg) {
			gp_error("Type '%s -h' for help",progname) ;
	}

	/* open the file pointer to read the sequences 
 	 * from: standard input or a file provided? 
 	 * We must only do it if we _need_ any input; in -p mode the first
 	 * positional word is not opened as input. */

	if(onlyprint == FALSE) {
		if(optind >= argc) {
			in = stdin ;
		} else {
			in = gp_file_open(argv[optind],"r") ;
			if(in == NULL) {
				/* NOTE(review): gp_error() presumably terminates the
				 * program; the explicit return guards against it not
				 * doing so */
				gp_error("Failed to open file %s for reading", argv[optind]) ;
				return(EXIT_FAILURE) ;
			}
		}
	}


	/* 
	 * opening the file pointer to write the output: 
	 * standard output or file provided? 
	 * The output file is always the second positional word.
	 */

	optind++ ;

	if(optind >= argc) {
		out = stdout ;
	} else {
		out = gp_file_open(argv[optind],"wb") ;
		if(out == NULL) {
			gp_error("Failed to open file %s for writing", argv[optind]) ;
			return(EXIT_FAILURE) ;
		}
	}


	/* -p: print the (possibly overridden) codon table and stop */
	if(onlyprint == TRUE) {
		(void) PrintCodonTable(out,tabela,tableformat) ;
		free(tabela) ;
		if(codet != NULL) fclose(codet) ;
		fclose(out) ;
		return(EXIT_SUCCESS) ;
	}

	/* all optional elements are now defined. We can get them together.
	 * The length limits are given in bases on the command line and are
	 * converted to codons here. */

	options.minlen = (options.minlen)/3 ;
	options.maxlen = (options.maxlen)/3 ;
	options.out = out ;
	options.table = tabela ;

	/* Main program loop, executed as long there are sequences on input.
	 * ExtractOrfs() frees each sequence it is given. */

	while( (inseq = gp_seq_read_fragment(in,0,0,0)) != NULL) {
	
		nseq++ ;
		if(debug) gp_warn("Processing sequence %i",nseq) ;
		norfs += ExtractOrfs(inseq,options) ;
	}
	
	gp_warn("%i sequence(s) processed, total ORFs found %i",nseq,norfs) ;

	free(tabela) ;
	if(codet!= NULL) fclose(codet) ;
	if(html) gp_warn_print_all(out) ;
	fclose(out) ;
	fclose(in) ;
	return(EXIT_SUCCESS);
}




/* 
 * Given a sequence, ExtractOrfs scans both strands (the sequence itself and
 * the one returned by gp_seq_reverse()) in all three reading frames, prints
 * the positions of every ORF that passes the length filter to opt.out, and
 * returns the number of ORFs printed.
 *
 * An ORF is reported when its translated length is strictly greater than
 * opt.minlen and, unless opt.maxlen is 0, strictly smaller than opt.maxlen.
 *
 * Ownership: inseq and its sequ buffer are freed before returning, so the
 * caller must not use inseq afterwards.
 *
 * NOTE(review): opt.otype and opt.width are not consulted here; only the
 * ORF positions are printed.
 */

int ExtractOrfs(sekw *inseq, struct options_s opt) {

	FILE *out = opt.out ;
	int norfs = 0 ;
	sekw *tempseq, *outseq ;
	sekw *revseq = NULL ;	/* reverse strand, owned by this function */
	int h,j ;
	long i ;
	char z ;

	/* first, the forward sequence, then the reverse */
	for(h = 0;h<2;h++) {

		if(h == 0) {
			tempseq = inseq ;
			z = ' ' ;
		} else {
			revseq = gp_seq_reverse(inseq) ;
			if(revseq == NULL) {
				gp_warn("Could not reverse the sequence, skipping the reverse strand") ;
				break ;
			}
			tempseq = revseq ;
			z = '-' ;
		}
	
		/* j stands for three reading frames possible for each strand */
		for(j = 0;j<3;j++) {
			if(debug) gp_warn("reading frame %c%i",z,j) ;

			/* scanning the whole sequence */
			for(i = j;i<tempseq->leng-6;i += 3) {
	
				if(gp_codon_isstart(tempseq,i) && 
					(outseq = gp_seq_dna_to_protein(tempseq,opt.table,i+1,TRUE)) != NULL) { 

					/* an ORF is born... */
					if(outseq->leng > opt.minlen && 
						(opt.maxlen == 0 || outseq->leng < opt.maxlen) ) {
						norfs++ ;
						/* i is long, so the conversion must be %li */
						if(debug) gp_warn("position %li",i+1) ;
						/* forward strand: positions as counted on the input;
						 * reverse strand: positions mapped back through
						 * the strand length */
						if(z == ' ')
							fprintf(out,"%li %li\n",i+1,i+3+3*outseq->leng) ;
						else fprintf(out,"%li %li\n",
							tempseq->leng - i,tempseq->leng-(i+2+3*outseq->leng)) ;
					}
					
					/* skip smaller orfs: jump past the translated stretch
					 * so start codons inside it are not reported again
					 * (the loop increment then adds a further 3) */
					i += (outseq->leng+1)*3 ;
					free(outseq->sequ) ;
					free(outseq) ;
				}   /* end of the start codon branch */
			}     /* end of scanning sequence in one frame */
		}       /* end of the three frames for() clause */
	}					/* end of extracting both directions */

	if(revseq != NULL) {
		free(revseq->sequ) ;
		free(revseq) ;
	}
	free(inseq->sequ) ;
	free(inseq) ;
	return(norfs) ;
}


/* 
 * Read the codon table from a file. The provided outtable will be directly
 * modified. Actually, making this procedure codont* makes no sense, but
 * it's because of backward compatibility.
 *
 * Input format: lines starting with '#' and empty lines are skipped. Every
 * other line holds three base letters (A, C, G, T or U, case-insensitive,
 * each optionally preceded by blanks or tabs) followed by one character out
 * of "0GAVLIPCMFWSTYNQKRHDE". A malformed line triggers a warning and is
 * ignored; it does not affect the lines that follow.
 *
 * Returns outtable.
 */

codont* ReadCodonTable(FILE *in, codont* outtable) {
	int i,j ;
	int t ;
	int coord[3] = { 0, 0, 0 } ;
	char bufor[BUFSIZ] ;
	int codons_read = 0, lines_read = 0, Conv[128] ;
	int check ;


	/* 
	 * The Conv matrix converts the nucleic acid letter to 
	 * appropriate coordinate for the codont matrix; 99 marks
	 * characters that are not bases
	 */

	for(i = 0;i<128;i++) Conv[i] = 99 ;
	Conv['A'] = 0 ; Conv['C'] = 1; Conv['G'] = 2 ; Conv['T'] = 3 ; Conv['U'] = 3 ;

	while(fgets(bufor,BUFSIZ,in) != NULL) {

		/* skipping comments and blank lines */
		lines_read++ ;
		if(bufor[0] == '#' || bufor[0] == '\n') continue ; 

		/* every line is validated on its own */
		check = TRUE ;

		/* reading the codon coordinates */
		for(i = 0,j = 0;i<3;i++,j++) {
			/* skipping blanks */
			while(bufor[j] == ' ' || bufor[j] == '\t') j++ ;

			/* line ended before three bases were seen; stay on the
			 * terminator instead of reading past it */
			if(bufor[j] == '\0') {
				check = FALSE ;
				break ;
			}

			/* <ctype.h> functions need an unsigned char value */
			t = toupper((unsigned char) bufor[j]) ;

			/* checking if the bases are in "ATCGU"; bytes outside the
			 * Conv range can never be bases */
			if(t >= 128 || Conv[t] > 3) {
				check = FALSE ;
			} else {
				coord[i] = Conv[t] ;
			}
		}
		
		/* reading the corresponding amino acid letter */
		/* skipping blanks */
		while(bufor[j] == ' ' || bufor[j] == '\t') j++ ;
		t = toupper((unsigned char) bufor[j]) ;

		/* strchr() also matches the terminating NUL of the set, so an
		 * empty amino acid field has to be rejected explicitly */
		if(check == FALSE || t == '\0' ||
			strchr("0GAVLIPCMFWSTYNQKRHDE",t) == NULL) {
			if(debug) fprintf(stderr,"t = %c\n",t) ;
			gp_warn("Problems reading codon table") ;
		} else {
			outtable->tbl[coord[0]][coord[1]][coord[2]] = (char) t ;
			codons_read++ ;
		}

	}

	if(debug) 
		fprintf(stderr,"%i lines read, %i codons read\n",
						lines_read, codons_read) ;

	return(outtable);

}



/*
 * Write the codon table to a stream. Supported values of type:
 * 0     - one line per codon: the three bases, a blank and the one letter
 *         amino acid code stored in the table
 * other - an ASCII table with the first codon position as row groups, the
 *         second position as columns and the third position as the rows
 *         inside a group; entries stored as '0' are shown as STOP
 *
 * Always returns EXIT_SUCCESS.
 */

int PrintCodonTable(FILE* out, codont *intable, int type) {

	/* table coordinate -> base letter */
	static const char base[4] = { 'A', 'C', 'G', 'U' } ;
	int first, second, third ;

	/* standard .cdn format, just like the one seq2prot can read */
	if(type == 0) {
		fputs("# Codon table \n", out) ;

		for(first = 0; first < 4; first++) {
			for(second = 0; second < 4; second++) {
				for(third = 0; third < 4; third++) {
					fprintf(out,"%c%c%c %c\n",
						base[first], base[second], base[third],
						intable->tbl[first][second][third]) ;
				}
			}
		}

		return(EXIT_SUCCESS) ;
	}

	/* table header */
	fputs("\n\n", out) ;
	fputs("                              2nd position of codon\n", out) ;
	fputs("1st                A              C              G              U\n", out) ;
	fputs("position ------------------------------------------------------------\n", out) ;
	fputs("of codon\n", out) ;

	for(first = 0; first < 4; first++) {

		/* one output line per third-position base */
		for(third = 0; third < 4; third++) {

			/* the row group label goes on the second line of the group */
			if(third == 1) {
				fprintf(out, "     %c ", base[first]) ;
			} else {
				fputs("       ", out) ;
			}

			/* one cell per second-position base */
			for(second = 0; second < 4; second++) {

				if(intable->tbl[first][second][third] != '0') {
					/* codon, one letter code and three letter code */
					fprintf(out, "    %c%c%c  %c(%s)",
						base[first], base[second], base[third],
						intable->tbl[first][second][third],
						one2three[intable->tbl[first][second][third]]) ;
				} else {
					fprintf(out, "    %c%c%c   STOP ",
						base[first], base[second], base[third]) ;
				}
			}

			fputs("\n", out) ; /* end of line */
		}

		fputs("\n", out) ; /* blank line between row groups */
	}

	fputs("\n\n", out) ;

	return(EXIT_SUCCESS) ;
}





/*
 * Print the usage summary to standard output and terminate the program
 * with exit status 0. This function does not return.
 */
void Help()
{
	printf("\n");
	printf("%s version %s - print ORFs found in a sequence",PROGNAME,VERSION);
	printf("\n");
	printf(" Usage:\n");
	printf("     %s [options] [ input file ] [ output file ]\n",progname);
	printf("\n");
	printf(" Options:\n");
	/* the default below must stay in sync with MINSEQLEN (given in bases) */
	printf("  -m value  : set minimal ORF length to value (default 300 bases)\n");
	printf("  -M value  : set maximal ORF length to value (default - unlimited)\n");
	printf("  -o s      : print the ORF protein sequences\n");
	printf("  -o n      : show the ORF positions\n");
	printf("  -c file   : read the optional codon usage table \n");
	printf("  -p n[ice] : will print out a formated codon table & exit(default)\n");
	printf("  -p u[gly] : will print out the codon table & exit\n");
	printf("  -H        : run in HTML mode (see manual)\n");
	printf("  -q        : run in quiet mode\n");
	printf("  -d        : run in debug mode\n");
	printf("  -v        : print version information & exit\n");
	printf("  -h        : print this help screen & exit\n\n");
	exit(0);
}
