!===============================================================================
! Copyright 2020-2022 Intel Corporation.
!
! This software and the related documents are Intel copyrighted  materials,  and
! your use of  them is  governed by the  express license  under which  they were
! provided to you (License).  Unless the License provides otherwise, you may not
! use, modify, copy, publish, distribute,  disclose or transmit this software or
! the related documents without Intel's prior written permission.
!
! This software and the related documents  are provided as  is,  with no express
! or implied  warranties,  other  than those  that are  expressly stated  in the
! License.
!===============================================================================

!  Content:
!      Intel(R) oneAPI Math Kernel Library (oneMKL)
!      FORTRAN OpenMP offload examples for DOMATADD_BATCH_STRIDED
!*******************************************************************************

include "mkl_omp_offload.f90"
include "common_blas.f90"

! Example driver: runs mkl_domatadd_batch_strided once on the host and once
! through OpenMP offload, then compares the two results with dcheck_matrix.
! Prints "PASSED" and terminates normally when dcheck_matrix returns 0;
! otherwise terminates with stop code 1.
program domatadd_batch_strided_example
#if defined(MKL_ILP64)
use onemkl_blas_omp_offload_ilp64
#else
use onemkl_blas_omp_offload_lp64
#endif
use common_blas

implicit none

! Storage order and transposition flags forwarded to the oneMKL routine.
character(len=1) :: ordering = 'C', transa = 'T', transb = 'N'
! Dimensions forwarded to the routine and the number of matrices in the batch.
integer :: row = 5, col = 3, batch_size = 10
integer :: lda, ldb, ldc, stridea, strideb, stridec, passed
integer :: alloc_stat
! Double-precision literals so the scalars are not rounded to default real.
double precision :: alpha = 1.2d0
double precision :: beta = -0.7d0
double precision, allocatable :: a(:,:), b(:,:), c(:,:), c_ref(:,:)

! Leading dimensions; the stride of each operand is the element count of
! one matrix of the batch.
lda = col
ldb = row
ldc = row
stridea = lda * row
strideb = ldb * col
stridec = ldc * col

! Every batch is stored as a (stride x batch_size) array, one matrix per column.
! stat= is required here: without it a failed allocation terminates the
! program immediately and the error path below could never execute.
allocate(a(stridea,batch_size), b(strideb,batch_size), &
         c(stridec,batch_size), c_ref(stridec,batch_size), stat=alloc_stat)

if (alloc_stat /= 0) then
   ! Objects allocated before the failure stay allocated; release them.
   if (allocated(a)) deallocate(a)
   if (allocated(b)) deallocate(b)
   if (allocated(c)) deallocate(c)
   if (allocated(c_ref)) deallocate(c_ref)
   print *, 'Error: cannot allocate matrices'
   stop 1
end if

! initialize matrices
call dinit_matrix('N', stridea, batch_size, stridea, a)
call dinit_matrix('N', strideb, batch_size, strideb, b)
call dinit_matrix('N', stridec, batch_size, stridec, c)
! c_ref starts from the same contents as c
call dcopy_matrix(stridec, batch_size, stridec, c, c_ref)

! Calling domatadd_batch_strided on the CPU
call mkl_domatadd_batch_strided(ordering, transa, transb, row, col, &
                                alpha, a, lda, stridea, &
                                beta, b, ldb, strideb, &
                                c_ref, ldc, stridec, batch_size)

! Calling domatadd_batch_strided on the GPU
!$omp target data map(a,b,c)
#if defined(ONEMKL_USE_OPENMP_VERSION) && (ONEMKL_USE_OPENMP_VERSION >= 202011)
!$omp dispatch
#else
!$omp target variant dispatch device(0) use_device_ptr(a,b,c)
#endif
call mkl_domatadd_batch_strided(ordering, transa, transb, row, col, &
                                alpha, a, lda, stridea, &
                                beta, b, ldb, strideb, &
                                c, ldc, stridec, batch_size)
#if !defined(ONEMKL_USE_OPENMP_VERSION) || (ONEMKL_USE_OPENMP_VERSION < 202011)
!$omp end target variant dispatch
#endif
!$omp end target data

! Compare result of CPU and GPU implementation
passed = dcheck_matrix(stridec, batch_size, stridec, c, c_ref)

deallocate(a)
deallocate(b)
deallocate(c)
deallocate(c_ref)

! A non-zero value from dcheck_matrix is treated as a mismatch.
if (passed /= 0) then
   stop 1
end if

print *, "PASSED"

end program domatadd_batch_strided_example
