/*************************************************************************

   Program:    ProFit
   File:       NWAlign.c
   
   Version:    V2.2
   Date:       20.12.01
   Function:   Protein Fitting program. 
   
   Copyright:  SciTech Software 1992-2001
   Author:     Dr. Andrew C. R. Martin
   Address:    SciTech Software
               23, Stag Leys,
               Ashtead,
               Surrey,
               KT21 2TD.
   Phone:      +44 (0)1372 275775
   EMail:      andrew@bioinf.org.uk
               
**************************************************************************

   This program is not in the public domain.

   It may not be copied or made available to third parties, but may be
   freely used by non-profit-making organisations who have obtained it
   directly from the author or by FTP.

   You are requested to send EMail to the author to say that you are 
   using this code so that you may be informed of future updates.

   The code may not be made available on other FTP sites without express
   permission from the author.

   The code may be modified as required, but any modifications must be
   documented so that the person responsible can be identified. If
   someone else breaks this code, the author doesn't want to be blamed
   for code that does not work! You may not distribute any
   modifications, but are encouraged to send them to the author so
   that they may be incorporated into future versions of the code.

   Such modifications become the property of Dr. Andrew C.R. Martin and
   SciTech Software though their origin will be acknowledged.

   The code may not be sold commercially or used for commercial purposes
   without prior permission from the author.
   
**************************************************************************

   Description:
   ============

**************************************************************************

   Usage:
   ======

**************************************************************************

   Revision History:
   =================
   V0.1  25.09.92 Original
   V0.5  08.10.93 Various tidying for Unix & changed for booklib 
   V0.6  05.01.94 Modified MDMFILE for Unix getenv()
   V0.7  24.11.94 The DATAENV environment variable is now handled by code
                  in bioplib/align.c/ReadMDM; Checks the return from this
                  Fixed bug in multi-zone align
   V0.8  17.07.95 Replaced screen() stuff with printf()
                  Only allowed on single chains
   V1.0  18.07.95 Insert codes now work.
                  First official release (at last!).
   V1.1  20.07.95 Skipped
   V1.2  22.07.95 Added GAPPEN command making gap penalty global variable
   V1.3  31.07.95 Skipped
   V1.4  14.08.95 Skipped
   V1.5  21.08.95 Fixed bug in mapping alignment to zones. Also bug in
                  Bioplib align() routine
   V1.5b 15.11.95 Now also prints a score normalised by the length of the
                  shorter sequence.
   V1.6  20.11.95 Added ReadAlignment() code
   V1.6b 22.11.95 Added check in SetNWZones() for a deletion in both
                  sequences
   V1.6c 13.12.95 The check added in 1.6b wasn't working. Fixed!
   V1.6g 18.06.96 Changed MODE_* to ZONE_MODE_*
   V1.7  23.07.96 Skipped
   V1.7g 06.05.98 Rewrite of SetNWZones()
   V1.8  07.05.98 Skipped for release
   V2.0  01.03.01 Additions for iterative zone updating
   V2.1  28.03.01 Parameter for ITERATE and added CENTRE command
   V2.2  20.12.01 Skipped for release

*************************************************************************/
/* Includes
*/
#include "ProFit.h"


/************************************************************************/
/*>NWAlign(int strucnum)
   ---------------------
   Aligns the reference sequence (gRefSeq) against the mobile sequence
   gMobSeq[strucnum] using align(), prints the two aligned sequences in
   blocks of 60 columns followed by the score and the score divided by
   the length of the shorter sequence, then replaces the fitting zones
   of structure strucnum with zones derived from the alignment.

   The mutation data matrix (MDMFILE) is read on the first call for
   which the read succeeds. Every failure prints a message and returns
   before the fitting zones are touched; any alignment buffers that
   were allocated are released first.

   28.09.92 Framework
   09.10.92 Original
   05.01.94 Modified to get data directory name from environment variable
            under Unix
   17.07.95 Replaced screen() with printf()
            Check only one chain in each structure.
   22.07.95 Made gap penalty a variable
   15.11.95 Also prints a score normalised by the length of the shorter 
            sequence.
   01.02.01 Added strucnum parameter
*/
void NWAlign(int strucnum)
{
   static   int FirstCall = TRUE;
   const    int cols_per_line = 60;
   int      ref_len,
            mob_len,
            align_len,
            width,
            offset,
            score,
            i;
   char     *ref_align = NULL,
            *mob_align = NULL;
         
   printf("   Performing N&W alignment...\n");

   if(FirstCall)
   {
      if(!ReadMDM(MDMFILE))
      {
         printf("   Error==> Unable to read mutation data matrix\n");
         return;
      }
      
      /* Only flag success, so a failed read is retried next time       */
      FirstCall = FALSE;
   }
   
   /* Make checks that structures read                                  */
   if(gRefSeq==NULL || gMobSeq[strucnum]==NULL)
   {
      printf("   Error==> Structures have not been read!\n");
      return;
   }

   /* Check for numbers of chains ('*' in a sequence is treated as a
      chain separator)
   */
   if((countchar(gRefSeq,'*') > 0) || 
      (countchar(gMobSeq[strucnum],'*') > 0))
   {
      printf("   Error==> Structures must have only one chain for "
             "alignment\n");
      return;
   }

   /* Find sequence lengths                                             */
   ref_len = strlen(gRefSeq);
   mob_len = strlen(gMobSeq[strucnum]);
   
   /* Allocate memory for alignment sequences. An alignment can be no
      longer than the two sequences laid end to end.
   */
   if((ref_align = (char *)malloc((ref_len+mob_len)*sizeof(char)))==NULL)
   {
      printf("   Warning==> No memory for alignment!\n");
      return;
   }
   if((mob_align = (char *)malloc((ref_len+mob_len)*sizeof(char)))==NULL)
   {
      printf("   Warning==> No memory for alignment!\n");
      free(ref_align);
      return;
   }
   
   /* Perform the alignment                                             */
   score = align(gRefSeq, ref_len, gMobSeq[strucnum], mob_len, FALSE, 
                 FALSE, gGapPen, ref_align, mob_align, &align_len);
   if(!score)
   {
      printf("   Error==> Unable to perform alignment!\n");
      /* Don't leak the alignment buffers on failure                    */
      free(ref_align);
      free(mob_align);
      return;
   }
   
   /* Display the fitted sequences: a line of the reference followed by
      the equivalent line of the mobile sequence, cols_per_line columns 
      at a time. The characters are always printed through a literal 
      "%c" format so that sequence data is never used as a format 
      string. The loop body runs at least once so that an empty 
      alignment still prints the two (empty) lines.
   */
   offset = 0;
   printf("   ");
   do
   {
      width = align_len - offset;
      if(width > cols_per_line)
         width = cols_per_line;

      for(i=0; i<width; i++)                 /* Reference section       */
      {
         printf("%c",ref_align[offset+i]);
      }
      printf("\n   ");

      for(i=0; i<width; i++)                 /* Equivalent mobile part  */
      {
         printf("%c",mob_align[offset+i]);
      }
      printf("\n\n   ");

      offset += cols_per_line;
   }  while(offset < align_len);
   
   printf("Score: %d Normalised score: %.2f\n",
          score,
          (REAL)score/(REAL)(MIN(ref_len,mob_len)));
   
   /* Clear any current fitting zones                                   */
   SetFitZone("CLEAR", strucnum);
   
   /* Now set zones based on alignment                                  */
   SetNWZones(ref_align, mob_align, align_len, NULL, NULL, strucnum);
   
   /* Free allocated memory                                             */
   free(ref_align);
   free(mob_align);
   
   return;
}


/************************************************************************/
/*>void ReadAlignment(char *alnfile)
   ---------------------------------
   Reads sequences from an alignment file in PIR format and sets up 
   fitting zones from them. The first sequence in the file is taken as 
   the reference; each following sequence is paired with it to define 
   the zones for structure 0, 1, 2, ... in turn.

   Every entry must contain a single chain. An error is printed and 
   the function returns if the file cannot be opened, if the first 
   entry yields no sequence, or if any entry has more than one chain.
   Warnings are printed if the number of sequences following the 
   reference is greater or smaller than gMultiCount.

   Double deletions are removed from a private copy of the reference
   for each pairing, so the reference as read from the file keeps its
   original columns for the next sequence.

   NOTE(review): gMultiCount is assumed to be the number of structures
   for which zones may be set (valid strucnum values 0..gMultiCount-1),
   as the warning messages below imply - confirm against its 
   definition.

   20.11.95 Original    By: ACRM
   01.02.01 Modified to cope with multiple structures.
*/
void ReadAlignment(char *alnfile)
{
   FILE *fp;
   char *seqa[MAXCHAIN],
        *seqb[MAXCHAIN],
        *ref_copy,
        *ref_seq;
   BOOL punct, error;
   int  i,
        nchain,
        len_ref,
        len_mob,
        strucnum = 0;
   
   /* Open the PIR alignment file for reading                           */
   if((fp=fopen(alnfile,"r"))==NULL)
   {
      printf("   Error==> Unable to read alignment file (%s)\n", alnfile);
      return;
   }
   
   /* Read the first sequence from the file                             */
   nchain = ReadPIR(fp, TRUE, seqa, MAXCHAIN, NULL, &punct, &error);
   /* Check there was only one chain                                    */
   if(nchain > 1)
   {
      printf("   Error==> Alignment only available with a single "
             "chain.\n");
      fclose(fp);
      for(i=0; i<nchain; i++)
         free(seqa[i]);
      return;
   }
   if(nchain < 1)
   {
      printf("   Error==> No sequence read from alignment file.\n");
      fclose(fp);
      return;
   }

   /* Terminate sequence at the *                                       */
   TERMAT(seqa[0], '*');
   

   /* Read each further sequence from the file; the loop ends when 
      ReadPIR() returns no chains
   */
   while((nchain = ReadPIR(fp, TRUE, seqb, MAXCHAIN, NULL, 
                           &punct, &error)) > 0)
   {
      /* Check there was only one chain                                 */
      if(nchain > 1)
      {
         printf("   Error==> Alignment only available with a single "
                "chain\n");
         for(i=0; i<nchain; i++)
            free(seqb[i]);
         /* The reference sequence must be released here too            */
         free(seqa[0]);
         fclose(fp);
         return;
      }

      /* Refuse surplus sequences before any zones are written for a
         structure index which is out of range
      */
      if(strucnum >= gMultiCount)
      {
         printf("   Warning==> Alignment file contains more sequences than there\n");
         printf("              are structures.\n");
         free(seqb[0]);
         break;
      }
      
      /* Terminate sequences at the *                                   */
      TERMAT(seqb[0], '*');
      
      /* Clear any current fitting zones                                */
      SetFitZone("CLEAR", strucnum);
      
      /* Remove any deletions which appear in both sequences. This is
         done on a copy of the reference: editing seqa[0] itself would 
         shift its columns relative to the sequences still to be read.
         If the copy cannot be made, the unmodified reference is used.
      */
      ref_seq  = seqa[0];
      ref_copy = (char *)malloc((strlen(seqa[0])+1) * sizeof(char));
      if(ref_copy != NULL)
      {
         strcpy(ref_copy, seqa[0]);
         ref_seq = ref_copy;
      }
      if((ref_copy == NULL) || !RemoveDoubleDeletions(ref_copy, seqb[0]))
      {
         printf("   Warning==> No memory to remove double deletions.\n");
         printf("              Will try to remove them as we go...\n");
      }
      
      /* Now set zones based on alignment, using only the columns which
         are present in both sequences
      */
      len_ref = (int)strlen(ref_seq);
      len_mob = (int)strlen(seqb[0]);
      SetNWZones(ref_seq, seqb[0], MIN(len_ref, len_mob),
                 NULL, NULL, strucnum);
      
      free(ref_copy);
      free(seqb[0]);

      strucnum++;
   }

   if(strucnum < gMultiCount)
   {
      printf("   Warning==> Insufficient sequences in alignment file.\n");
      printf("              Fitting may fail!\n");
   }
   
   /* Free allocated memory and close file                              */
   free(seqa[0]);
   fclose(fp);
}


/************************************************************************/
/*>BOOL RemoveDoubleDeletions(char *seqa, char *seqb)
   --------------------------------------------------
   Remove deletions which appear in both sequences when reading an
   alignment file. This often occurs when the two sequences have come
   from part of a multiple alignment.

   Both strings are edited in place. Only the columns present in both
   strings (up to the length of the shorter one) are examined; a column
   is dropped when both strings hold '-' there. If one string is 
   longer, its extra characters are kept unchanged after the compacted
   columns.

   No temporary storage is needed, so the routine always returns TRUE.
   The BOOL return is kept for callers which test for failure.

   13.12.95 Original    By: ACRM
*/
BOOL RemoveDoubleDeletions(char *seqa, char *seqb)
{
   int  in, out,
        lena,
        lenb,
        common;

   lena   = strlen(seqa);
   lenb   = strlen(seqb);
   common = MIN(lena, lenb);

   /* Compact the shared columns, skipping any double deletions. The 
      write position never overtakes the read position so this is safe
      to do in place.
   */
   for(in=0, out=0; in<common; in++)
   {
      if((seqa[in] != '-') || (seqb[in] != '-'))
      {
         seqa[out] = seqa[in];
         seqb[out] = seqb[in];
         out++;
      }
   }

   /* Close up the gap in front of whatever follows the shared columns:
      the unpaired tail of the longer string plus its terminator, or 
      just the terminator. Source and destination may overlap, hence
      memmove().
   */
   memmove(seqa+out, seqa+common, (size_t)(lena-common+1));
   memmove(seqb+out, seqb+common, (size_t)(lenb-common+1));
   
   return(TRUE);
}

/************************************************************************/
/*>SetNWZones(char *ref_align, char *mob_align, int align_len,
              PDB **RefIndex, PDB **MobIndex, int strucnum)
   -----------------------------------------------------------
   Searches through the N&W sequence alignment and creates fitting zones
   from the equivalent regions.

   Any existing zone list for structure strucnum is freed. Residues are
   numbered sequentially from 1 in each sequence, counting every 
   character other than '-'. Each alignment column in the selected 
   range which has a residue in both sequences becomes a one-residue 
   zone in ZONE_MODE_SEQUENTIAL; MergeZones() then joins adjacent zones.

   The range is the whole alignment unless both gLimit[0] and gLimit[1]
   are at least 1, in which case columns gLimit[0]..gLimit[1] (counted
   from 1) are used, clamped to the alignment.

   If both RefIndex and MobIndex are supplied, a residue pair is only
   used when DISTSQ() of the two indexed entries is no more than
   gMaxEquivDistSq. If either is NULL no distance check is made.

   On success gFitted is set FALSE and gUserFitZone TRUE. If memory 
   runs out an error is printed and the routine returns at once, 
   leaving the zones created so far unmerged and the flags unchanged.

   09.10.92 Original
   24.11.94 Fixed bug causing it to lose first zone in multi-zone match
   17.07.95 Replaced screen() with printf()
   18.07.95 Added initialisation of inserts in zones
   21.08.95 Fixed bug in additional non-existent zone being added when
            last zone not at end of chain
   22.11.95 Added check on deletion in both sequences
   13.12.95 Wasn't doing this check when stepping through a block of 
            deletions. Fixed.
   18.06.96 Changed MODE_* to ZONE_MODE_*

   06.05.98 Completely rewritten! New version is 27% shorter, MUCH
            simpler and fixes a bug which occurred when a zone had only
            one residue.
   15.01.01 Simplified even further by making each residue an individual
            zone. This is not as elegant, but makes the implementation
            of distance checking much easier. If RefIndex and MobIndex
            are NULL, it behaves as before. If not then the distance
            between an atom pair is checked before adding the residue
            pair to the zone. Finally calls MergeZones() to merge
            adjacent zones.
   01.02.01 Added strucnum parameter
   20.02.01 Added check on gLimit[]
*/
void SetNWZones(char *ref_align, char *mob_align, int align_len,
                PDB **RefIndex, PDB **MobIndex, int strucnum)
{
   int   i,
         start,
         stop,
         ref_resnum  = 0,
         mob_resnum  = 0;
   BOOL  check_dist;
   ZONE  *z,
         *tail = NULL;

   if(gZoneList[strucnum])
   {
      FREELIST(gZoneList[strucnum], ZONE);
      gZoneList[strucnum] = NULL;
   }

   /* Distances can only be tested when both indices are available;
      this also stops a single NULL index from being dereferenced
   */
   check_dist = ((RefIndex != NULL) && (MobIndex != NULL)) ? TRUE : FALSE;
   
   /* Get the start and stop of the region we are going to look at from
      gLimit[] if it has been specified. Otherwise just use the whole 
      alignment length
   */
   if((gLimit[0] < 1) || (gLimit[1] < 1))
   {
      start = 0;
      stop  = align_len;
   }
   else
   {
      start = gLimit[0] - 1;
      stop  = gLimit[1];
   }

   if(start > align_len)
      start = align_len-1;
   if(start < 0)                 /* e.g. limits set but empty alignment */
      start = 0;
   if(stop > align_len)
      stop  = align_len;

   /* Count the residues which come before the region so that numbering
      is correct for the first zone
   */
   for(i=0; i<start; i++)
   {
      if(ref_align[i] != '-') ref_resnum++;
      if(mob_align[i] != '-') mob_resnum++;
   }

   for(i=start; i<stop; i++)
   {
      /* Find the residue number in each structure                      */
      if(ref_align[i] != '-') ref_resnum++;
      if(mob_align[i] != '-') mob_resnum++;
      
      /* Columns with a deletion in either sequence give no zone        */
      if((ref_align[i] == '-') || (mob_align[i] == '-'))
         continue;

      /* Skip pairs which fail the distance test                        */
      if(check_dist &&
         (DISTSQ(RefIndex[ref_resnum-1], MobIndex[mob_resnum-1]) >
          gMaxEquivDistSq))
         continue;

      /* Allocate the zone. The list was emptied above, so tail always
         points at the last zone added by this loop and the list never
         has to be walked to find its end.
      */
      if(tail != NULL)
      {
         z = tail;
         ALLOCNEXT(z,ZONE);
      }
      else
      {
         INIT(gZoneList[strucnum],ZONE);
         z = gZoneList[strucnum];
      }
      if(z==NULL)
      {
         printf("   Error==> No memory for N&W fitting zones!\n");
         return;
      }
      tail = z;
      
      /* Store the zone: a single residue in each structure             */
      z->chain1       = ' ';
      z->start1       = ref_resnum;
      z->startinsert1 = ' ';
      z->stop1        = ref_resnum;
      z->stopinsert1  = ' ';
      z->chain2       = ' ';
      z->start2       = mob_resnum;
      z->startinsert2 = ' ';
      z->stop2        = mob_resnum;
      z->stopinsert2  = ' ';
      z->mode         = ZONE_MODE_SEQUENTIAL;
   }

   MergeZones(strucnum);
   
   /* Set fitting flags                                                 */
   gFitted      = FALSE;
   gUserFitZone = TRUE;
}


/************************************************************************/
/*>void MergeZones(int strucnum)
   -----------------------------
   Merges zones describing sequentially numbered adjacent amino acids 

   Walks the zone list of structure strucnum once. Whenever a zone and
   its successor are both in ZONE_MODE_SEQUENTIAL and the successor
   starts, in both structures, at the residue immediately after the
   zone's stop residue, the zone's stop values are extended to those of
   the successor, which is then unlinked and freed. The extended zone
   is tested again against its new successor before moving on, so runs 
   of any length are collapsed in a single pass.

   An empty list is left untouched.

   15.01.01 Original   By: ACRM
   01.02.01 Added strucnum parameter
*/
void MergeZones(int strucnum)
{
   ZONE *z  = gZoneList[strucnum],
        *zn = NULL;
   
   while((z != NULL) && ((zn = z->next) != NULL))
   {
      if((z->mode    == ZONE_MODE_SEQUENTIAL) &&
         (zn->mode   == ZONE_MODE_SEQUENTIAL) &&
         (zn->start1 == (z->stop1 + 1))       &&
         (zn->start2 == (z->stop2 + 1)))
      {
         /* Absorb the successor; stay on z so that it is compared with
            the zone which now follows it
         */
         z->stop1 = zn->stop1;
         z->stop2 = zn->stop2;
         z->next  = zn->next;
         free(zn);
      }
      else
      {
         NEXT(z);
      }
   }
}



