C Steven G. Johnson (stevenj@alum.mit.edu) 7/6/97
C
C This file contains FFT subroutines by D. H. Bailey extracted from his
C MPFUN package.  They have been slightly modified to allow me to call
C them from our C benchmark program.
C
C
C*****************************************************************************
C
C   MPFUN: A MULTIPLE PRECISION FLOATING POINT COMPUTATION PACKAGE
C
C   Standard Fortran-77 version
C   Version Date:  March 14, 1995
C
C   Author:
C
C      David H. Bailey                 Telephone:   415-604-4410
C      NASA Ames Research Center       Facsimile:   415-604-3957
C      Mail Stop T045-1                Internet:    dbailey@nas.nasa.gov
C      Moffett Field, CA 94035
C      USA
C
C   Restrictions:
C
C   This software has now been approved by NASA for unrestricted distribution.
C   However, usage of this software is subject to the following:
C
C   1. This software is offered without warranty of any kind, either expressed
C      or implied.  The author would appreciate, however, any reports of bugs
C      or other difficulties that may be encountered.
C   2. If modifications or enhancements to this software are made to this
C      software by others, NASA Ames reserves the right to obtain this enhanced
C      software at no cost and with no restrictions on its usage.
C   3. The author and NASA Ames are to be acknowledged in any published paper
C      based on computations using this software.  Accounts of practical
C      applications or other benefits resulting from this software are of
C      particular interest.  Please send a copy of such papers to the author.
C
C   Description:
C
C   The following information is a brief description of this program.  For
C   full details and instructions for usage, see the paper "A Portable High
C   Performance Multiprecision Package", available from the author.
C
C   This package of Fortran subroutines performs multiprecision floating point
C   arithmetic.  If sufficient main memory is available, the maximum precision
C   level is at least 16 million digits.  The maximum dynamic range is at
C   least 10^(+-14,000,000).  It employs advanced algorithms, including an
C   FFT-based multiplication routine and some recently discovered
C   quadratically convergent algorithms for pi, exp and log.  The package also
C   features extensive debug and self-checking facilities, so that it can be
C   used as a rigorous system integrity test.  All of the routines in this
C   package have been written to facilitate vector and parallel processing.
C
C   For users who do not wish to manually write code that calls these routines,
C   an automatic translator program is available from the author that converts
C   ordinary Fortran-77 code into code that calls these routines.  Contact the
C   author for details.
C
C   This package should run correctly on any computer with a Fortran-77
C   compiler that meets certain minimal floating point accuracy standards.
C   Any system based on the IEEE floating point standard, with a 25 bit
C   mantissa in single precision and a 53 bit mantissa in double precision,
C   easily meets these requirements.  All DEC VAX systems meet these
C   requirements.  All IBM mainframes and workstations meet these requirements.
C   Cray systems meet all of these requirements with double precision disabled
C   (i.e. by using only single precision).
C
C   Machine-specific tuning notes may be located by searching for the text
C   string C> in this program file.  It is highly recommended that these notes
C   be read before running this package on a specific system.  Also,
C   certain vectorizable DO loops that are often not recognized as such by
C   vectorizing compilers are prefaced with Cray CDIR$ IVDEP directives.  On
C   other vector systems these directives should be replaced by the
C   appropriate equivalents.
C
C   Instructions for compiling and testing this program are included in the
C   readme file that accompanies this file.
C
C*****************************************************************************
      BLOCK DATA
      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
      PARAMETER (BBXC = 4096.D0, NBTC = 24, NPRC = 32, MCRC = 7,
     $  BDXC = BBXC ** 2, BX2C = BDXC ** 2, RBXC = 1.D0 / BBXC,
     $  RDXC = RBXC ** 2, RX2C = RDXC ** 2, RXXC = 16.D0 * RX2C)
      END
C
      SUBROUTINE MPINIX (M, U)
C
C   This initializes the double precision array U in common MPCOM5 with roots
C   of unity required by the FFT routines, which are called by MPMULX.  Before
C   calling any of the advanced MP routines (i.e. those whose names end in X),
C   this routine must be called with M set to MX, where MX is defined as the
C   integer such that 2 ^ MX = NX, and where NX is the largest precision level
C   NW that will be used in the subsequent application.  Before calling MPINIX,
C   the user must allocate at least 2^(M + 3) double precision cells in common
C   MPCOM5, which must be placed in the user's main program.  Also, at least
C   12 * NW + 6 double precision cells must be allocated in common MPCOM4.
C   Only one call to MPINIX is required, no matter how many advanced routines
C   are called.  It is not necessary for the user to call MPINIX, to allocate
C   space in MPCOM5 or to allocate more than NW + 6 cells in MPCOM4 if the
C   advanced routines are not called.
C
      DOUBLE PRECISION PI, T1, T2, U
      PARAMETER (PI = 3.141592653589793238D0)
C     COMMON /MPCOM5/ U(1024)
      DIMENSION U(*)
C
C   Initialize the U array with sines and cosines in a manner that permits
C   stride one access at each FFT iteration.
C
      MM = M + 2
      N = 2 ** MM
      NU = N
      U(1) = 64 * N + MM
      KU = 2
      KN = KU + NU
      LN = 1
C
      DO 110 J = 1, MM
        T1 = PI / LN
C
CDIR$ IVDEP
        DO 100 I = 0, LN - 1
          T2 = I * T1
          U(I+KU) = COS (T2)
          U(I+KN) = SIN (T2)
 100    CONTINUE
C
        KU = KU + LN
        KN = KU + NU
        LN = 2 * LN
 110  CONTINUE
C
      RETURN
      END
C
      SUBROUTINE MPCFFT (IS, M, X, Y, U)
C
C   This routine computes the 2^M -point complex-to-complex FFT of X.  See
C   article by DHB in Intl. J. of Supercomputer Applications, Spring 1988,
C   p. 82 - 87).  X and Y are double precision.  X is both the input and the
C   output array, while Y is a scratch array.  Both X and Y must be
C   dimensioned with 2 * N cells, where N = 2^M.  The data in X are assumed
C   to have real and imaginary parts separated by N cells.  A call to MPCFFT
C   with IS = 1 (or -1) indicates a call to perform a FFT with positive (or
C   negative) exponentials.  M must be at least two.  Before calling MPCRFT,
C   the array in MPCOM5 must be initialized by calling MPINIX.
C
C   In this application, MPCFFT is called by MPRCFT and MPCRFT, which are in
C   turn called by MPMULX.  This routine is not intended to be called directly
C   by the user.
C
      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
      DIMENSION X(*), Y(*), U(*)
C
      N = 2 ** M
C>
C   For Cray computers, it is most efficient to limit M1 to 6.  For most
C   scalar computers, it is best to limit M1 to 2.  Uncomment whichever of the
C   next two lines is appropriate.
C
C      M1 = MIN (M / 2, 6)
      M1 = MIN (M / 2, 2)
      M2 = M - M1
      N2 = 2 ** M1
      N1 = 2 ** M2
C
C   Perform one variant of the Stockham FFT.
C
      DO 100 L = 1, M1, 2
        CALL MPFFT1 (IS, L, M, X, Y, U)
        IF (L .EQ. M1) GOTO 120
        CALL MPFFT1 (IS, L + 1, M, Y, X, U)
 100  CONTINUE
C
C   Perform a transposition of X treated as a N2 x N1 x 2 matrix.
C
      CALL MPTRAN (N1, N2, X, Y)
C
C   Perform second variant of the Stockham FFT from Y to X and X to Y.
C
      DO 110 L = M1 + 1, M, 2
        CALL MPFFT2 (IS, L, M, Y, X, U)
        IF (L .EQ. M) GOTO 160
        CALL MPFFT2 (IS, L + 1, M, X, Y, U)
 110  CONTINUE
C
      GOTO 140
C
C   Perform a transposition of Y treated as a N2 x N1 x 2 matrix.
C
 120  CALL MPTRAN (N1, N2, Y, X)
C
C   Perform second variant of the Stockham FFT from X to Y and Y to X.
C
      DO 130 L = M1 + 1, M, 2
        CALL MPFFT2 (IS, L, M, X, Y, U)
        IF (L .EQ. M) GOTO 140
        CALL MPFFT2 (IS, L + 1, M, Y, X, U)
 130  CONTINUE
C
      GOTO 160
C
C   Copy Y to X.
C
 140  DO 150 I = 1, 2 * N
        X(I) = Y(I)
 150  CONTINUE
C
 160  RETURN
      END
C
      SUBROUTINE MPFFT1 (IS, L, M, X, Y, U)
C
C   Performs the L-th iteration of the first variant of the Stockham FFT.
C   This routine is called by MPCFFT.  It is not intended to be called directly
C   by the user.
C
      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
      DIMENSION X(*), Y(*), U(*)
C     COMMON /MPCOM5/ U(1024)
C
C   Set initial parameters.
C
      N = 2 ** M
      K = U(1)
      NU = K / 64
      N1 = N / 2
      LK = 2 ** (L - 1)
      LI = 2 ** (M - L)
      LJ = 2 * LI
      KU = LI + 1
      KN = KU + NU
C
      DO 100 K = 0, LK - 1
        I11 = K * LJ + 1
        I12 = I11 + LI
        I21 = K * LI + 1
        I22 = I21 + N1
C
CDIR$ IVDEP
        DO 100 I = 0, LI - 1
          U1 = U(KU+I)
          U2 = IS * U(KN+I)
          X11 = X(I11+I)
          X12 = X(I11+I+N)
          X21 = X(I12+I)
          X22 = X(I12+I+N)
          T1 = X11 - X21
          T2 = X12 - X22
          Y(I21+I) = X11 + X21
          Y(I21+I+N) = X12 + X22
          Y(I22+I) = U1 * T1 - U2 * T2
          Y(I22+I+N) = U1 * T2 + U2 * T1
 100  CONTINUE
C
      RETURN
      END
C
      SUBROUTINE MPFFT2 (IS, L, M, X, Y, U)
C
C   Performs the L-th iteration of the second variant of the Stockham FFT.
C   This routine is called by MPCFFT.  It is not intended to be called directly
C   by the user.
C
      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
      DIMENSION X(*), Y(*), U(*)
C     COMMON /MPCOM5/ U(1024)
C
C   Set initial parameters.
C
      N = 2 ** M
      K = U(1)
      NU = K / 64
      N1 = N / 2
      LK = 2 ** (L - 1)
      LI = 2 ** (M - L)
      LJ = 2 * LK
      KU = LI + 1
C
      DO 100 I = 0, LI - 1
        I11 = I * LK + 1
        I12 = I11 + N1
        I21 = I * LJ + 1
        I22 = I21 + LK
        U1 = U(KU+I)
        U2 = IS * U(KU+I+NU)
C
CDIR$ IVDEP
        DO 100 K = 0, LK - 1
          X11 = X(I11+K)
          X12 = X(I11+K+N)
          X21 = X(I12+K)
          X22 = X(I12+K+N)
          T1 = X11 - X21
          T2 = X12 - X22
          Y(I21+K) = X11 + X21
          Y(I21+K+N) = X12 + X22
          Y(I22+K) = U1 * T1 - U2 * T2
          Y(I22+K+N) = U1 * T2 + U2 * T1
 100  CONTINUE
C
      RETURN
      END
C
      SUBROUTINE MPTRAN (N1, N2, X, Y)
C
C   Performs a transpose of the vector X, returning the result in Y.  X is
C   treated as a N1 x N2 complex matrix, and Y is treated as a N2 x N1 complex
C   matrix.  The complex data is assumed stored with real and imaginary parts
C   separated by N1 x N2 locations.
C
C   This routine is called by MPCFFT.  It is not intended to be called directly
C   by the user.
C
      IMPLICIT DOUBLE PRECISION (A-H, O-Z)
      PARAMETER (NA = 32, NC = 32)
      DIMENSION X(2*N1*N2), Y(2*N1*N2), Z(NC,2*NC)
C
      N = N1 * N2
C>
C   Use different techniques, depending on the system, N1 and N2.  For Cray
C   systems, uncomment the next line.
C
C      GOTO 100
C
C   This strategy is good for many scalar cache memory computers.  The
C   value of NC (i.e. the size of Z) may have to be changed depending on
C   how large the cache is.
C
      IF (N1 .LE. NC .OR. N2 .LE. NC) THEN
        IF (N1 .GE. N2) THEN
          GOTO 110
        ELSE
          GOTO 130
        ENDIF
      ELSE
        GOTO 150
      ENDIF
C
C   This strategy is best for Cray systems.
C
 100  IF (N1 .LT. NA .OR. N2 .LT. NA) THEN
        IF (N1 .GE. N2) THEN
          GOTO 110
        ELSE
          GOTO 130
        ENDIF
      ELSE
        GOTO 220
      ENDIF
C
C   Scheme 1:  Perform a simple transpose in the usual way.
C
 110  DO 120 J = 0, N2 - 1
        J1 = J + 1
        J2 = J * N1 + 1
C
CDIR$ IVDEP
        DO 120 I = 0, N1 - 1
          Y(I*N2+J1) = X(I+J2)
          Y(I*N2+J1+N) = X(I+J2+N)
 120  CONTINUE
C
      GOTO 260
C
C   Scheme 2:  Perform a simple transpose with the loops reversed.
C
 130  DO 140 I = 0, N1 - 1
        I1 = I * N2 + 1
        I2 = I + 1
C
CDIR$ IVDEP
        DO 140 J = 0, N2 - 1
          Y(J+I1) = X(J*N1+I2)
          Y(J+I1+N) = X(J*N1+I2+N)
 140  CONTINUE
C
      GOTO 260
C
C   Scheme 3:  Perform a transpose using the intermediate array Z.  This gives
C   better performance than schemes 1 and 2 on certain cache memory systems.
C   The size of the array Z (i.e. the parameter NC above) may have to be
C   adjusted for optimal performance.
C
 150  DO 210 JJ = 0, N2 - 1, NC
        DO 200 II = 0, N1 - 1, NC
C
          DO 170 J = 1, NC
            J1 = II + (J - 1 + JJ) * N1
C
CDIR$ IVDEP
            DO 160 I = 1, NC
              Z(J,I) = X(I+J1)
              Z(J,I+NC) = X(I+J1+N)
 160        CONTINUE
C
 170      CONTINUE
C
          DO 190 I = 1, NC
            I1 = JJ + (I - 1 + II) * N2
C
CDIR$ IVDEP
            DO 180 J = 1, NC
              Y(J+I1) = Z(J,I)
              Y(J+I1+N) = Z(J,I+NC)
 180        CONTINUE
C
 190      CONTINUE
C
 200    CONTINUE
 210  CONTINUE
C
      GOTO 260
C
C   Scheme 4:  Perform the transpose along diagonals to insure odd strides.
C   This works well on moderate vector, variable stride computers, when both
C   N1 and N2 are divisible by reasonably large powers of two (32 or larger on
C   Cray computers).
C
 220  N11 = N1 + 1
      N21 = N2 + 1
      IF (N1 .GE. N2) THEN
        K1 = N1
        K2 = N2
        I11 = N1
        I12 = 1
        I21 = 1
        I22 = N2
      ELSE
        K1 = N2
        K2 = N1
        I11 = 1
        I12 = N2
        I21 = N1
        I22 = 1
      ENDIF
C
      DO 230 J = 0, K2 - 1
        J1 = J * I11 + 1
        J2 = J * I12 + 1
C
CDIR$ IVDEP
        DO 230 I = 0, K2 - 1 - J
          Y(N21*I+J2) = X(N11*I+J1)
          Y(N21*I+J2+N) = X(N11*I+J1+N)
 230  CONTINUE
C
      DO 240 J = 1, K1 - K2 - 1
        J1 = J * I21 + 1
        J2 = J * I22 + 1
C
CDIR$ IVDEP
        DO 240 I = 0, K2 - 1
          Y(N21*I+J2) = X(N11*I+J1)
          Y(N21*I+J2+N) = X(N11*I+J1+N)
 240  CONTINUE
C
      DO 250 J = K1 - K2, K1 - 1
        J1 = J * I21 + 1
        J2 = J * I22 + 1
C
CDIR$ IVDEP
        DO 250 I = 0, K1 - 1 - J
          Y(N21*I+J2) = X(N11*I+J1)
          Y(N21*I+J2+N) = X(N11*I+J1+N)
 250  CONTINUE
C
 260  RETURN
      END
