#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A))
! (C) Copyright 2000- ECMWF.
! (C) Copyright 2000- Meteo-France.
! (C) Copyright 2022- NVIDIA.
! 
! This software is licensed under the terms of the Apache Licence Version 2.0
! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
! In applying this licence, ECMWF does not waive the privileges and immunities
! granted to it by virtue of its status as an intergovernmental organisation
! nor does it submit to any jurisdiction.
!

MODULE TRLTOM_PACK_UNPACK
  USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE
  USE PARKIND_ECTRANS,        ONLY: JPIM
  IMPLICIT NONE

  PRIVATE
  PUBLIC :: TRLTOM_PACK_HANDLE, PREPARE_TRLTOM_PACK, TRLTOM_PACK
  PUBLIC :: TRLTOM_UNPACK_HANDLE, PREPARE_TRLTOM_UNPACK, TRLTOM_UNPACK

  TYPE TRLTOM_PACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN
  END TYPE
  TYPE TRLTOM_UNPACK_HANDLE
    TYPE(ALLOCATION_RESERVATION_HANDLE) :: HINPS_AND_ZINPA
  END TYPE

  INTEGER(KIND=JPIM) :: A = 8 !Alignment

CONTAINS
  FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
    USE TPM_DISTR,              ONLY: D
    USE ISO_C_BINDING,          ONLY: C_SIZEOF
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
    TYPE(TRLTOM_PACK_HANDLE) :: HTRLTOM_PACK

    REAL(KIND=JPRBT) :: DUMMY

    HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_PACK%HFOUBUF_IN")
  END FUNCTION PREPARE_TRLTOM_PACK

  SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS)
    !**** *TRLTOM_PACK* - Copy fourier data from local array to buffer

    !     Purpose.
    !     --------
    !        Routine for copying fourier data from local array to buffer

    !**   Interface.
    !     ----------
    !     CALL TRLTOM_PACK(...)

    !     Explicit arguments :  PREEL - local fourier/GP array
    !     --------------------  KF_FS - number of fields
    !
    !     Externals.  None.
    !     ----------

    !     Author.
    !     -------
    !        Mats Hamrud *ECMWF*

    !     ------------------------------------------------------------------

    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPIB
    USE TPM_DISTR,              ONLY: D, MYSETW
    USE TPM_GEOMETRY,           ONLY: G
    USE TPM_DIM,                ONLY: R
    USE ISO_C_BINDING,          ONLY: C_SIZEOF
    !

    IMPLICIT NONE

    REAL(KIND=JPRBT), INTENT(IN) :: PREEL_COMPLEX(:)
    REAL(KIND=JPRBT), POINTER, INTENT(OUT) :: FOUBUF_IN(:)
    INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOM_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK

    INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL
    INTEGER(KIND=JPIB) :: IOFF_LAT,ISTA

    REAL(KIND=JPRBT)    :: SCAL

    ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, &
            & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX)

    CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),&
        & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(FOUBUF_IN(1)))

#ifdef OMPGPU
   !$OMP TARGET DATA MAP(PRESENT,ALLOC:G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,&
   !$OMP& D_NDGL_FS,G_NLOEN,R,R_NSMAX)
#endif
#ifdef ACCGPU
    !$ACC DATA PRESENT(G,G_NMEN,D,D_NPNTGTB0,FOUBUF_IN,PREEL_COMPLEX,D_NSTAGTF,D_NDGL_FS,G_NLOEN, R,R_NSMAX) ASYNC(1)
#endif

    ! scale results and move into next transformation buffer

    OFFSET_VAR=D_NPTRLS(MYSETW)

#ifdef OMPGPU
    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) &
    !$OMP& SHARED(D,R,KF_FS,OFFSET_VAR,G,&
    !$OMP& PREEL_COMPLEX,FOUBUF_IN) MAP(TO:KF_FS,OFFSET_VAR)
#endif
#ifdef ACCGPU
    !$ACC PARALLEL LOOP PRIVATE(IGLG,IOFF_LAT,ISTA,SCAL) FIRSTPRIVATE(KF_FS,OFFSET_VAR) &
    !$ACC& TILE(32,16,1) DEFAULT(NONE) &
#ifndef _CRAYFTN
    !$ACC& ASYNC(1)
#else
    !$ACC&
#endif
#endif
    DO KGL=1,D_NDGL_FS
      DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG)
        DO JF=1,KF_FS
          IGLG = OFFSET_VAR+KGL-1
          IF (JM <= G_NMEN(IGLG)) THEN
            IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL))

            SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT)
            ISTA  = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_FS

            FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1)
            FOUBUF_IN(ISTA+2*JF  ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2)
          ENDIF
        ENDDO
      ENDDO
    ENDDO
#ifdef ACCGPU
    !$ACC END DATA

    !$ACC WAIT(1)
#endif
#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif
    END ASSOCIATE
  END SUBROUTINE TRLTOM_PACK

  FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK)
    USE PARKIND_ECTRANS,        ONLY: JPIM, JPRBT, JPRD, JPIB
    USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
    USE LEDIR_MOD,              ONLY: LEDIR_STRIDES
    USE ISO_C_BINDING,          ONLY: C_SIZEOF

    IMPLICIT NONE

    TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
    TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK

    INTEGER(KIND=JPIM)  :: IIN_STRIDES0
    INTEGER(KIND=JPIB)  :: IIN_SIZE
    INTEGER(KIND=JPIM)  :: IIN0_STRIDES0, IIN0_SIZE
    INTEGER(KIND=JPIB)  :: ISIZE

    REAL(KIND=JPRBT) :: ZPRBT_DUMMY
    REAL(KIND=JPRD) :: ZPRD_DUMMY

    CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,&
                       IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE)

    ! Check if the reuse buffer is large enough
    ISIZE = ALIGN(IIN_SIZE*C_SIZEOF(ZPRBT_DUMMY),128)
    ISIZE = ISIZE + ALIGN(IIN_SIZE*C_SIZEOF(ZPRBT_DUMMY),128)
    ISIZE = ISIZE + ALIGN(IIN0_SIZE*C_SIZEOF(ZPRD_DUMMY),128)
    ISIZE = ISIZE + ALIGN(IIN0_SIZE*C_SIZEOF(ZPRD_DUMMY),128)

    HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE, "HTRLTOM_UNPACK%HINPS_AND_ZINPA")
  END FUNCTION PREPARE_TRLTOM_UNPACK

  SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV)
    USE PARKIND_ECTRANS,             ONLY: JPIM, JPRBT, JPRD, JPIB
    USE TPM_DIM,                     ONLY: R
    USE TPM_GEOMETRY,                ONLY: G
    USE BUFFERED_ALLOCATOR_MOD,      ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
    USE TPM_FIELDS,                  ONLY: F
    USE TPM_DISTR,                   ONLY: D
    USE LEDIR_MOD,                   ONLY: LEDIR_STRIDES
    USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZEOF

    IMPLICIT NONE

    REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF(:)
    REAL(KIND=JPRBT), POINTER, INTENT(INOUT) :: ZINPS(:), ZINPA(:)
    REAL(KIND=JPRD), POINTER, INTENT(INOUT) :: ZINPS0(:), ZINPA0(:)
    INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV
    TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
    TYPE(TRLTOM_UNPACK_HANDLE), INTENT(IN) :: HTRLTOM_UNPACK

    REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:)

    INTEGER(KIND=JPIM) :: IIN_STRIDES0
    INTEGER(KIND=JPIB) :: IIN_SIZE
    INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE

    INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ

    INTEGER(KIND=JPIB)  :: JF, OFFSET1, OFFSET2
    INTEGER(KIND=JPIM) :: KM, ISL, IGLS, JGL, KMLOC

    REAL(KIND=JPRBT) :: PAIA, PAIS

    ASSOCIATE(D_NUMP=>D%NUMP, R_NDGNH=>R%NDGNH, R_NDGL=>R%NDGL, F_RW=>F%RW, F_RACTHE=>F%RACTHE, &
            & D_MYMS=>D%MYMS, D_NPNTGTB1=>D%NPNTGTB1, D_OFFSETS_GEMM1=>D%OFFSETS_GEMM1, &
            & G_NDGLU=>G%NDGLU)
    CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,&
                       IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE)

    IALLOC_POS=1

    IALLOC_SZ = ALIGN(IIN_SIZE*C_SIZEOF(ZINPS(0)),128)
    CALL ASSIGN_PTR(ZINPS, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS=IALLOC_POS+IALLOC_SZ

    IALLOC_SZ = ALIGN(IIN_SIZE*C_SIZEOF(ZINPA(0)),128)
    CALL ASSIGN_PTR(ZINPA, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS=IALLOC_POS+IALLOC_SZ

    IALLOC_SZ = ALIGN(IIN0_SIZE*C_SIZEOF(ZINPS0(0)),128)
    CALL ASSIGN_PTR(ZINPS0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS=IALLOC_POS+IALLOC_SZ

    IALLOC_SZ = ALIGN(IIN0_SIZE*C_SIZEOF(ZINPA0(0)),128)
    CALL ASSIGN_PTR(ZINPA0, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HINPS_AND_ZINPA),&
        & IALLOC_POS, IALLOC_SZ)
    IALLOC_POS=IALLOC_POS+IALLOC_SZ

#ifdef OMPGPU
    !$OMP TARGET DATA MAP(PRESENT,ALLOC:ZINPS,ZINPA,ZINPS0,ZINPA0) &
    !$OMP& MAP(PRESENT,ALLOC:F,F_RW,F_RACTHE) &
    !$OMP& MAP(PRESENT,ALLOC:D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) &
    !$OMP& MAP(PRESENT,ALLOC:D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF)

    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) DEFAULT(NONE) &
    !$OMP& PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,PAIA,PAIS) &
    !$OMP& SHARED(D,R,KF_FS,G,FOUBUF,F,&
    !$OMP& IIN_STRIDES0,ZINPA,ZINPS,IIN0_STRIDES0,ZINPA0,ZINPS0,KF_UV) &
    !$OMP& MAP(TO:KF_FS,KF_UV,IIN_STRIDES0,IIN0_STRIDES0)
#endif
#ifdef ACCGPU
    !$ACC DATA &
    !$ACC& PRESENT(ZINPS,ZINPA,ZINPS0,ZINPA0) &
    !$ACC& PRESENT(F,F_RW,F_RACTHE) &
    !$ACC& PRESENT(D,D_MYMS,D_NUMP,R,R_NDGNH,R_NDGL,G,G_NDGLU) &
    !$ACC& PRESENT(D_NPNTGTB1,D_OFFSETS_GEMM1,FOUBUF)

    !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(3) PRIVATE(KM,ISL,IGLS,OFFSET1,OFFSET2,JGL,PAIA,PAIS) &
    !$ACC&              FIRSTPRIVATE(KF_FS,KF_UV,IIN_STRIDES0,IIN0_STRIDES0) &
#ifndef _CRAYFTN
    !$ACC& ASYNC(1)
#else
    !$ACC&
#endif
#endif
    DO KMLOC=1,D_NUMP
      DO JGL=1,R_NDGNH
        DO JF=1,KF_FS*2
          KM = D_MYMS(KMLOC)
          ISL = R_NDGNH-G_NDGLU(KM)+1
          IF (JGL >= ISL) THEN
            !(DO JGL=ISL,R_NDGNH)
            IGLS = R_NDGL+1-JGL
            OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_FS
            OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_FS
            PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF)
            PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF)
            IF (JF <= 4*KF_UV) THEN
                ! Multiply in case of velocity
              PAIA = PAIA*REAL(F_RACTHE(JGL),JPRBT)
              PAIS = PAIS*REAL(F_RACTHE(JGL),JPRBT)
            ENDIF
            IF (KM /= 0) THEN
              ZINPA(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIA*REAL(F_RW(JGL),JPRBT)
              ZINPS(JF+(JGL-ISL)*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=PAIS*REAL(F_RW(JGL),JPRBT)
            ELSEIF (MOD(JF-1,2) == 0) THEN
              ! every other field is sufficient because Im(KM=0) == 0
              ZINPA0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIA*REAL(F_RW(JGL),JPRBT)
              ZINPS0((JF-1)/2+1+(JGL-1)*IIN0_STRIDES0)=PAIS*REAL(F_RW(JGL),JPRBT)
            ENDIF
          ENDIF
        ENDDO
      ENDDO
    END DO

#if defined(USE_CUTLASS) && defined(USE_CUTLASS_3XTF32)
#ifdef OMPGPU
    !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(2) DEFAULT(NONE) PRIVATE(KM) &
    !$OMP& SHARED(D_NUMP,KF_FS,D_MYMS,G_NDGLU,D_OFFSETS_GEMM1,IIN_STRIDES0,ZINPA,ZINPS) MAP(TO:KF_FS)
#endif
#ifdef ACCGPU
    !$ACC PARALLEL LOOP DEFAULT(NONE) COLLAPSE(2) PRIVATE(KM,JGL) &
    !$ACC&              FIRSTPRIVATE(KF_FS,IIN_STRIDES0) ASYNC(1)
#endif
    DO KMLOC=1,D_NUMP
      DO JF=1,KF_FS*2
          KM = D_MYMS(KMLOC)
#ifdef ACCGPU
          !$ACC LOOP SEQ
#endif
          DO JGL=G_NDGLU(KM),ALIGN(G_NDGLU(KM),A)-1
            IF (KM /= 0) THEN
              ZINPA(JF+JGL*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=0.0_JPRB
              ZINPS(JF+JGL*IIN_STRIDES0+IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC))=0.0_JPRB
            ENDIF
          ENDDO
      ENDDO
    END DO
#endif

#ifdef OMPGPU
    !$OMP END TARGET DATA
#endif
#ifdef ACCGPU
    !$ACC END DATA
#endif

   END ASSOCIATE
  END SUBROUTINE TRLTOM_UNPACK

END MODULE TRLTOM_PACK_UNPACK

