!>
!! @file m_mpi_proxy.f90
!! @brief Contains module m_mpi_proxy
!! @author S. Bryngelson, K. Schmidmayer, V. Coralic, J. Meng, K. Maeda, T. Colonius
!! @version 1.0
!! @date JUNE 06 2019

!> @brief The module serves as a proxy to the parameters and subroutines
!!        available in the MPI implementation's MPI module. Specifically,
!!        the purpose of the proxy is to harness basic MPI commands into
!!        more complicated procedures so as to accomplish the communication
!!        goals for the simulation.
module m_mpi_proxy

    ! Dependencies =============================================================
#ifdef MFC_MPI
    use mpi                    !< Message passing interface (MPI) module
#endif

    use m_derived_types        !< Definitions of the derived types

    use m_global_parameters    !< Definitions of the global parameters
    ! ==========================================================================

    implicit none

    real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_send !<
    !! This variable is utilized to pack and send the buffer of the cell-average
    !! conservative variables, for a single computational domain boundary at a
    !! time, to the relevant neighboring processor.

    real(kind(0d0)), private, allocatable, dimension(:) :: q_cons_buff_recv !<
    !! q_cons_buff_recv is utilized to receive and unpack the buffer of the cell-
    !! average conservative variables, for a single computational domain boundary
    !! at a time, from the relevant neighboring processor.

    !> @name Generic flags used to identify and report MPI errors
    !> @{
    integer, private :: err_code, ierr
    !> @}

!$acc declare create(q_cons_buff_send, q_cons_buff_recv)

    real :: s_time, e_time
    real :: compress_time, mpi_time, decompress_time
    integer :: nCalls_time = 0

contains

    !> The subroutine initializes the MPI execution environment
        !! and queries both the number of processors which will be
        !! available for the job and the local processor rank.
    subroutine s_mpi_initialize() ! ----------------------------------------

#ifndef MFC_MPI

        ! Serial run only has 1 processor
        num_procs = 1
        ! Local processor rank is 0
        proc_rank = 0

#else

        ! Initializing the MPI environment
        call MPI_INIT(ierr)

        ! Checking whether the MPI environment has been properly initialized
        if (ierr /= MPI_SUCCESS) then
            print '(A)', 'Unable to initialize MPI environment. Exiting ...'
            call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)
        end if

        ! Querying the number of processors available for the job
        call MPI_COMM_SIZE(MPI_COMM_WORLD, num_procs, ierr)

        ! Querying the rank of the local processor
        call MPI_COMM_RANK(MPI_COMM_WORLD, proc_rank, ierr)

#endif

    end subroutine s_mpi_initialize ! --------------------------------------

    !> The subroutine terminates the MPI execution environment.
    subroutine s_mpi_abort() ! ---------------------------------------------

#ifndef MFC_MPI

        stop 1

#else

        ! Terminating the MPI environment
        call MPI_ABORT(MPI_COMM_WORLD, 1, ierr)

#endif

    end subroutine s_mpi_abort ! -------------------------------------------

    !> The subroutine that initializes MPI data structures
        !! @param q_cons_vf Conservative variables
    subroutine s_initialize_mpi_data(q_cons_vf) ! --------------------------

        type(scalar_field), &
            dimension(sys_size), &
            intent(IN) :: q_cons_vf

        integer, dimension(num_dims) :: sizes_glb, sizes_loc
        integer :: ierr

        integer :: i !< Generic loop iterator

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_initialize_mpi_data not supported without MPI.'

#else

        do i = 1, sys_size
            MPI_IO_DATA%var(i)%sf => q_cons_vf(i)%sf(0:m, 0:n, 0:p)
        end do
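        ! Worked example of the bookkeeping below (illustrative values only,
        ! not taken from an actual run): with m_glb = 199 split across two
        ! processors in the x-direction, each holds m = 99 cells, so
        ! sizes_glb(1) = 200 and sizes_loc(1) = 100, while start_idx(1) is 0
        ! on rank 0 and 100 on rank 1. The subarray view committed below then
        ! maps each rank's local block to its slot in the shared restart file.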
        ! Define global(g) and local(l) sizes for flow variables
        sizes_glb(1) = m_glb + 1; sizes_loc(1) = m + 1
        if (n > 0) then
            sizes_glb(2) = n_glb + 1; sizes_loc(2) = n + 1
            if (p > 0) then
                sizes_glb(3) = p_glb + 1; sizes_loc(3) = p + 1
            end if
        end if

        ! Define the view for each variable
        do i = 1, sys_size
            call MPI_TYPE_CREATE_SUBARRAY(num_dims, sizes_glb, sizes_loc, start_idx, &
                                          MPI_ORDER_FORTRAN, MPI_DOUBLE_PRECISION, MPI_IO_DATA%view(i), ierr)
            call MPI_TYPE_COMMIT(MPI_IO_DATA%view(i), ierr)
        end do

#endif

    end subroutine s_initialize_mpi_data ! ---------------------------------

    !> Halts all processes until all have reached barrier.
    subroutine s_mpi_barrier() ! -------------------------------------------

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_barrier not supported without MPI.'

#else

        ! Calling MPI_BARRIER
        call MPI_BARRIER(MPI_COMM_WORLD, ierr)

#endif

    end subroutine s_mpi_barrier ! -----------------------------------------

    !> The computation of parameters, the allocation of memory,
        !! the association of pointers and/or the execution of any
        !! other procedures that are necessary to setup the module.
    subroutine s_initialize_mpi_proxy_module() ! ---------------------------

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_initialize_mpi_proxy_module not supported without MPI.'

#else

        ! Allocating q_cons_buff_send and q_cons_buff_recv. Please note that
        ! for the sake of simplicity, both variables are provided sufficient
        ! storage to hold the largest buffer in the computational domain.

        if (n > 0) then
            if (p > 0) then
                allocate (q_cons_buff_send(0:-1 + buff_size*sys_size* &
                                           (m + 2*buff_size + 1)* &
                                           (n + 2*buff_size + 1)* &
                                           (p + 2*buff_size + 1)/ &
                                           (min(m, n, p) + 2*buff_size + 1)))
            else
                allocate (q_cons_buff_send(0:-1 + buff_size*sys_size* &
                                           (max(m, n) + 2*buff_size + 1)))
            end if
        else
            allocate (q_cons_buff_send(0:-1 + buff_size*sys_size))
        end if

        allocate (q_cons_buff_recv(0:ubound(q_cons_buff_send, 1)))

#endif

    end subroutine s_initialize_mpi_proxy_module ! -------------------------
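    ! Sizing sanity check for the buffers allocated above (illustrative values
    ! only): in 3D with buff_size = 2, sys_size = 5 and m = n = p = 99, each
    ! padded extent is 99 + 2*2 + 1 = 104, so the allocation reduces to
    ! 2*5*104*104*104/104 = 108160 reals. Dividing by the padded minimum
    ! extent removes the direction normal to the largest face of the local
    ! domain, which is exactly the face a single exchange must be able to hold.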
    !> Since only the processor with rank 0 reads and verifies
        !! the consistency of user inputs, these are initially not
        !! available to the other processors. The purpose of this
        !! subroutine is therefore to distribute the user inputs to
        !! the remaining processors in the communicator.
    subroutine s_mpi_bcast_user_inputs() ! ---------------------------------

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_bcast_user_inputs not supported without MPI.'

#else

        integer :: i, j !< Generic loop iterators

        ! Logistics
        call MPI_BCAST(case_dir, len(case_dir), MPI_CHARACTER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(run_time_info, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(t_step_old, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        ! Computational domain parameters
        call MPI_BCAST(m, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(n, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(p, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(m_glb, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(n_glb, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(p_glb, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(cyl_coord, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(dt, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(t_step_start, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(t_step_stop, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(t_step_save, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(debug, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)

        ! Simulation algorithm parameters
        call MPI_BCAST(model_eqns, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(num_fluids, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(adv_alphan, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(mpp_lim, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(time_stepper, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(weno_vars, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(weno_order, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(weno_eps, 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(mapped_weno, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(mp_weno, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(cu_mpi, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(weno_flat, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(riemann_flat, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(weno_Re_flux, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(riemann_solver, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(wave_speeds, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(avg_state, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(alt_soundspeed, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(null_weights, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(mixture_err, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(parallel_io, 1, MPI_LOGICAL, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(precision, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        call MPI_BCAST(bc_x%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_x%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_y%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_y%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_z%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_z%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        call MPI_BCAST(bc_x_glb%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_x_glb%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_y_glb%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_y_glb%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_z_glb%beg, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)
        call MPI_BCAST(bc_z_glb%end, 1, MPI_INTEGER, 0, MPI_COMM_WORLD, ierr)

        !
Fluids physical parameters do i = 1, num_fluids_max call MPI_BCAST(fluid_pp(i)%gamma, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%pi_inf, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%Re(1), 2, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%mul0, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%ss, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%pv, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%gamma_v, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%M_v, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%mu_v, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(fluid_pp(i)%k_v, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) end do !Tait EOS call MPI_BCAST(pref, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(rhoref, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) !Bubble modeling call MPI_BCAST(bubbles, 1, & MPI_LOGICAL, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(bubble_model, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(polytropic, 1, & MPI_LOGICAL, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(thermal, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(R0ref, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(nb, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(R0_type, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(Web, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(Ca, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(Re_inv, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(polydisperse, 1, & MPI_LOGICAL, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(poly_sigma, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(qbmm, 1, & MPI_LOGICAL, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(nnode, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) !Acoustic monopole call MPI_BCAST(monopole, 1, & MPI_LOGICAL, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(num_mono, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) do j = 1, num_probes_max do i = 1, 3 call MPI_BCAST(mono(j)%loc(i), 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) end do call MPI_BCAST(mono(j)%mag, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%length, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%delay, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%dir, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%npulse, 1, & MPI_DOUBLE_PRECISION, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%pulse, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) call MPI_BCAST(mono(j)%support, 1, & MPI_INTEGER, 0, & MPI_COMM_WORLD, ierr) end do call MPI_BCAST(com_wrt(1), num_fluids_max, MPI_LOGICAL, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(cb_wrt(1), num_fluids_max, MPI_LOGICAL, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(fd_order, 1, MPI_INTEGER, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(num_probes, 1, MPI_INTEGER, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(probe_wrt, 1, MPI_LOGICAL, & 0, MPI_COMM_WORLD, ierr) do i = 1, num_probes_max call MPI_BCAST(probe(i)%x, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(probe(i)%y, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call 
MPI_BCAST(probe(i)%z, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) end do call MPI_BCAST(integral_wrt, 1, MPI_LOGICAL, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(num_integrals, 1, MPI_INTEGER, & 0, MPI_COMM_WORLD, ierr) do i = 1, num_probes_max call MPI_BCAST(integral(i)%xmin, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(integral(i)%xmax, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(integral(i)%ymin, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(integral(i)%ymax, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(integral(i)%zmin, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(integral(i)%zmax, 1, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) end do call MPI_BCAST(threshold_mf(1), 5, MPI_DOUBLE_PRECISION, & 0, MPI_COMM_WORLD, ierr) call MPI_BCAST(moment_order(1), 5, MPI_INTEGER, & 0, MPI_COMM_WORLD, ierr) #endif end subroutine s_mpi_bcast_user_inputs ! ------------------------------- subroutine mpi_bcast_time_step_values(proc_time, time_avg) real(kind(0d0)), dimension(0:num_procs - 1), intent(INOUT) :: proc_time real(kind(0d0)), intent(INOUT) :: time_avg #ifndef MFC_MPI print '(A)', '[m_mpi_proxy] mpi_bcast_time_step_values not supported without MPI.' #else integer :: j call MPI_GATHER(time_avg, 1, MPI_DOUBLE_PRECISION, proc_time(0), 1, MPI_DOUBLE_PRECISION, 0, MPI_COMM_WORLD, ierr ) #endif end subroutine mpi_bcast_time_step_values !> The purpose of this procedure is to optimally decompose !! the computational domain among the available processors. !! This is performed by attempting to award each processor, !! in each of the coordinate directions, approximately the !! same number of cells, and then recomputing the affected !! global parameters. subroutine s_mpi_decompose_computational_domain() ! -------------------- #ifndef MFC_MPI print '(A)', '[m_mpi_proxy] s_mpi_decompose_computational_domain not supported without MPI.' #else integer :: num_procs_x, num_procs_y, num_procs_z !< !! Optimal number of processors in the x-, y- and z-directions real(kind(0d0)) :: tmp_num_procs_x, tmp_num_procs_y, tmp_num_procs_z !< !! Non-optimal number of processors in the x-, y- and z-directions real(kind(0d0)) :: fct_min !< !! Processor factorization (fct) minimization parameter integer :: MPI_COMM_CART !< !! Cartesian processor topology communicator integer :: rem_cells !< !! Remaining number of cells, in a particular coordinate direction, !! after the majority is divided up among the available processors integer :: i, j !< Generic loop iterators if (num_procs == 1 .and. parallel_io) then do i = 1, num_dims start_idx(i) = 0 end do return end if ! 3D Cartesian Processor Topology ================================== if (n > 0) then if (p > 0) then if (cyl_coord .and. p > 0) then ! Implement pencil processor blocking if using cylindrical coordinates so ! that all cells in azimuthal direction are stored on a single processor. ! This is necessary for efficient application of Fourier filter near axis. ! Initial values of the processor factorization optimization num_procs_x = 1 num_procs_y = num_procs num_procs_z = 1 ierr = -1 ! Computing minimization variable for these initial values tmp_num_procs_x = num_procs_x tmp_num_procs_y = num_procs_y tmp_num_procs_z = num_procs_z fct_min = 10d0*abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) ! Searching for optimal computational domain distribution do i = 1, num_procs if (mod(num_procs, i) == 0 & .and. 
& (m + 1)/i >= num_stcls_min*weno_order) then tmp_num_procs_x = i tmp_num_procs_y = num_procs/i if (fct_min >= abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) & .and. & (n + 1)/tmp_num_procs_y & >= & num_stcls_min*weno_order) then num_procs_x = i num_procs_y = num_procs/i fct_min = abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) ierr = 0 end if end if end do else ! Initial estimate of optimal processor topology num_procs_x = 1 num_procs_y = 1 num_procs_z = num_procs ierr = -1 ! Benchmarking the quality of this initial guess tmp_num_procs_x = num_procs_x tmp_num_procs_y = num_procs_y tmp_num_procs_z = num_procs_z fct_min = 10d0*abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) & + 10d0*abs((n + 1)/tmp_num_procs_y & - (p + 1)/tmp_num_procs_z) ! Optimization of the initial processor topology do i = 1, num_procs if (mod(num_procs, i) == 0 & .and. & (m + 1)/i >= num_stcls_min*weno_order) then do j = 1, num_procs/i if (mod(num_procs/i, j) == 0 & .and. & (n + 1)/j >= num_stcls_min*weno_order) then tmp_num_procs_x = i tmp_num_procs_y = j tmp_num_procs_z = num_procs/(i*j) if (fct_min >= abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) & + abs((n + 1)/tmp_num_procs_y & - (p + 1)/tmp_num_procs_z) & .and. & (p + 1)/tmp_num_procs_z & >= & num_stcls_min*weno_order) & then num_procs_x = i num_procs_y = j num_procs_z = num_procs/(i*j) fct_min = abs((m + 1)/tmp_num_procs_x & - (n + 1)/tmp_num_procs_y) & + abs((n + 1)/tmp_num_procs_y & - (p + 1)/tmp_num_procs_z) ierr = 0 end if end if end do end if end do end if ! Verifying that a valid decomposition of the computational ! domain has been established. If not, the simulation exits. if (proc_rank == 0 .and. ierr == -1) then print '(A)', 'Unsupported combination of values '// & 'of num_procs, m, n, p and '// & 'weno_order. Exiting ...' call s_mpi_abort() end if ! Creating new communicator using the Cartesian topology call MPI_CART_CREATE(MPI_COMM_WORLD, 3, (/num_procs_x, & num_procs_y, num_procs_z/), & (/.true., .true., .true./), & .false., MPI_COMM_CART, ierr) ! Finding the Cartesian coordinates of the local process call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 3, & proc_coords, ierr) ! END: 3D Cartesian Processor Topology ============================= ! Global Parameters for z-direction ================================ ! Number of remaining cells rem_cells = mod(p + 1, num_procs_z) ! Optimal number of cells per processor p = (p + 1)/num_procs_z - 1 ! Distributing the remaining cells do i = 1, rem_cells if (proc_coords(3) == i - 1) then p = p + 1; exit end if end do ! Boundary condition at the beginning if (proc_coords(3) > 0 .or. bc_z%beg == -1) then proc_coords(3) = proc_coords(3) - 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_z%beg, ierr) proc_coords(3) = proc_coords(3) + 1 end if ! Boundary condition at the end if (proc_coords(3) < num_procs_z - 1 .or. bc_z%end == -1) then proc_coords(3) = proc_coords(3) + 1 call MPI_CART_RANK(MPI_COMM_CART, proc_coords, & bc_z%end, ierr) proc_coords(3) = proc_coords(3) - 1 end if if (parallel_io) then if (proc_coords(3) < rem_cells) then start_idx(3) = (p + 1)*proc_coords(3) else start_idx(3) = (p + 1)*proc_coords(3) + rem_cells end if end if ! ================================================================== ! 2D Cartesian Processor Topology ================================== else ! Initial estimate of optimal processor topology num_procs_x = 1 num_procs_y = num_procs ierr = -1 ! 
                ! Benchmarking the quality of this initial guess
                tmp_num_procs_x = num_procs_x
                tmp_num_procs_y = num_procs_y
                fct_min = 10d0*abs((m + 1)/tmp_num_procs_x &
                                   - (n + 1)/tmp_num_procs_y)

                ! Optimization of the initial processor topology
                do i = 1, num_procs

                    if (mod(num_procs, i) == 0 &
                        .and. &
                        (m + 1)/i >= num_stcls_min*weno_order) then

                        tmp_num_procs_x = i
                        tmp_num_procs_y = num_procs/i

                        if (fct_min >= abs((m + 1)/tmp_num_procs_x &
                                           - (n + 1)/tmp_num_procs_y) &
                            .and. &
                            (n + 1)/tmp_num_procs_y &
                            >= &
                            num_stcls_min*weno_order) then

                            num_procs_x = i
                            num_procs_y = num_procs/i
                            fct_min = abs((m + 1)/tmp_num_procs_x &
                                          - (n + 1)/tmp_num_procs_y)
                            ierr = 0

                        end if

                    end if

                end do

                ! Verifying that a valid decomposition of the computational
                ! domain has been established. If not, the simulation exits.
                if (proc_rank == 0 .and. ierr == -1) then
                    print '(A)', 'Unsupported combination of values '// &
                        'of num_procs, m, n and '// &
                        'weno_order. Exiting ...'
                    call s_mpi_abort()
                end if

                ! Creating new communicator using the Cartesian topology
                call MPI_CART_CREATE(MPI_COMM_WORLD, 2, (/num_procs_x, &
                                                          num_procs_y/), &
                                     (/.true., .true./), .false., &
                                     MPI_COMM_CART, ierr)

                ! Finding the Cartesian coordinates of the local process
                call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 2, &
                                     proc_coords, ierr)

            end if
            ! END: 2D Cartesian Processor Topology =============================

            ! Global Parameters for y-direction ================================

            ! Number of remaining cells
            rem_cells = mod(n + 1, num_procs_y)

            ! Optimal number of cells per processor
            n = (n + 1)/num_procs_y - 1

            ! Distributing the remaining cells
            do i = 1, rem_cells
                if (proc_coords(2) == i - 1) then
                    n = n + 1; exit
                end if
            end do

            ! Boundary condition at the beginning
            if (proc_coords(2) > 0 .or. bc_y%beg == -1) then
                proc_coords(2) = proc_coords(2) - 1
                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
                                   bc_y%beg, ierr)
                proc_coords(2) = proc_coords(2) + 1
            end if

            ! Boundary condition at the end
            if (proc_coords(2) < num_procs_y - 1 .or. bc_y%end == -1) then
                proc_coords(2) = proc_coords(2) + 1
                call MPI_CART_RANK(MPI_COMM_CART, proc_coords, &
                                   bc_y%end, ierr)
                proc_coords(2) = proc_coords(2) - 1
            end if

            if (parallel_io) then
                if (proc_coords(2) < rem_cells) then
                    start_idx(2) = (n + 1)*proc_coords(2)
                else
                    start_idx(2) = (n + 1)*proc_coords(2) + rem_cells
                end if
            end if
            ! ==================================================================

        ! 1D Cartesian Processor Topology ==================================
        else

            ! Optimal processor topology
            num_procs_x = num_procs

            ! Creating new communicator using the Cartesian topology
            call MPI_CART_CREATE(MPI_COMM_WORLD, 1, (/num_procs_x/), &
                                 (/.true./), .false., MPI_COMM_CART, &
                                 ierr)

            ! Finding the Cartesian coordinates of the local process
            call MPI_CART_COORDS(MPI_COMM_CART, proc_rank, 1, &
                                 proc_coords, ierr)

        end if
        ! ==================================================================
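        ! Example of the cell distribution performed below (illustrative values
        ! only): splitting m + 1 = 103 cells across num_procs_x = 4 processors
        ! gives 103/4 = 25 cells each with rem_cells = 3 left over, so the
        ! first three ranks receive one extra cell (26, 26, 26 and 25 cells)
        ! and, when parallel_io is used, start_idx(1) evaluates to 0, 26, 52
        ! and 78, respectively.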
        ! Global Parameters for x-direction ================================

        ! Number of remaining cells
        rem_cells = mod(m + 1, num_procs_x)

        ! Optimal number of cells per processor
        m = (m + 1)/num_procs_x - 1

        ! Distributing the remaining cells
        do i = 1, rem_cells
            if (proc_coords(1) == i - 1) then
                m = m + 1; exit
            end if
        end do

        ! Boundary condition at the beginning
        if (proc_coords(1) > 0 .or. bc_x%beg == -1) then
            proc_coords(1) = proc_coords(1) - 1
            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%beg, ierr)
            proc_coords(1) = proc_coords(1) + 1
        end if

        ! Boundary condition at the end
        if (proc_coords(1) < num_procs_x - 1 .or. bc_x%end == -1) then
            proc_coords(1) = proc_coords(1) + 1
            call MPI_CART_RANK(MPI_COMM_CART, proc_coords, bc_x%end, ierr)
            proc_coords(1) = proc_coords(1) - 1
        end if

        if (parallel_io) then
            if (proc_coords(1) < rem_cells) then
                start_idx(1) = (m + 1)*proc_coords(1)
            else
                start_idx(1) = (m + 1)*proc_coords(1) + rem_cells
            end if
        end if
        ! ==================================================================

#endif

    end subroutine s_mpi_decompose_computational_domain ! ------------------
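    ! Note on the message tags used in the sendrecv routines below: tag 0
    ! always marks data traveling toward the domain end (sent to the %end
    ! neighbor, received from the %beg neighbor) and tag 1 marks data
    ! traveling toward the domain beginning, so the two directions of a
    ! periodic exchange cannot be confused with one another.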
    !> The goal of this procedure is to populate the buffers of
        !! the grid variables by communicating with the neighboring
        !! processors. Note that only the buffers of the cell-width
        !! distributions are handled in such a way. This is because
        !! the buffers of cell-boundary locations may be calculated
        !! directly from those of the cell-width distributions.
        !! @param mpi_dir MPI communication coordinate direction
        !! @param pbc_loc Processor boundary condition (PBC) location
    subroutine s_mpi_sendrecv_grid_variables_buffers(mpi_dir, pbc_loc) ! ---

        integer, intent(IN) :: mpi_dir
        integer, intent(IN) :: pbc_loc

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_sendrecv_grid_variables_buffers not supported without MPI.'

#else

        ! MPI Communication in x-direction =================================
        if (mpi_dir == 1) then

            if (pbc_loc == -1) then      ! PBC at the beginning

                if (bc_x%end >= 0) then      ! PBC at the beginning and end

                    ! Send/receive buffer to/from bc_x%end/bc_x%beg
                    call MPI_SENDRECV( &
                        dx(m - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                        dx(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the beginning only

                    ! Send/receive buffer to/from bc_x%beg/bc_x%beg
                    call MPI_SENDRECV( &
                        dx(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
                        dx(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            else                         ! PBC at the end

                if (bc_x%beg >= 0) then      ! PBC at the end and beginning

                    ! Send/receive buffer to/from bc_x%beg/bc_x%end
                    call MPI_SENDRECV( &
                        dx(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%beg, 1, &
                        dx(m + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the end only

                    ! Send/receive buffer to/from bc_x%end/bc_x%end
                    call MPI_SENDRECV( &
                        dx(m - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                        dx(m + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_x%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            end if
        ! END: MPI Communication in x-direction ============================

        ! MPI Communication in y-direction =================================
        elseif (mpi_dir == 2) then

            if (pbc_loc == -1) then      ! PBC at the beginning

                if (bc_y%end >= 0) then      ! PBC at the beginning and end

                    ! Send/receive buffer to/from bc_y%end/bc_y%beg
                    call MPI_SENDRECV( &
                        dy(n - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%end, 0, &
                        dy(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the beginning only

                    ! Send/receive buffer to/from bc_y%beg/bc_y%beg
                    call MPI_SENDRECV( &
                        dy(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
                        dy(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            else                         ! PBC at the end

                if (bc_y%beg >= 0) then      ! PBC at the end and beginning

                    ! Send/receive buffer to/from bc_y%beg/bc_y%end
                    call MPI_SENDRECV( &
                        dy(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%beg, 1, &
                        dy(n + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the end only

                    ! Send/receive buffer to/from bc_y%end/bc_y%end
                    call MPI_SENDRECV( &
                        dy(n - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%end, 0, &
                        dy(n + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_y%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            end if
        ! END: MPI Communication in y-direction ============================

        ! MPI Communication in z-direction =================================
        else

            if (pbc_loc == -1) then      ! PBC at the beginning

                if (bc_z%end >= 0) then      ! PBC at the beginning and end

                    ! Send/receive buffer to/from bc_z%end/bc_z%beg
                    call MPI_SENDRECV( &
                        dz(p - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%end, 0, &
                        dz(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the beginning only

                    ! Send/receive buffer to/from bc_z%beg/bc_z%beg
                    call MPI_SENDRECV( &
                        dz(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
                        dz(-buff_size), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%beg, 0, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            else                         ! PBC at the end

                if (bc_z%beg >= 0) then      ! PBC at the end and beginning

                    ! Send/receive buffer to/from bc_z%beg/bc_z%end
                    call MPI_SENDRECV( &
                        dz(0), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%beg, 1, &
                        dz(p + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                else                         ! PBC at the end only

                    ! Send/receive buffer to/from bc_z%end/bc_z%end
                    call MPI_SENDRECV( &
                        dz(p - buff_size + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%end, 0, &
                        dz(p + 1), buff_size, &
                        MPI_DOUBLE_PRECISION, bc_z%end, 1, &
                        MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

                end if

            end if

        end if
        ! END: MPI Communication in z-direction ============================

#endif

    end subroutine s_mpi_sendrecv_grid_variables_buffers ! -----------------
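    ! Illustration of the x-direction exchange above (illustrative values
    ! only): with buff_size = 2 and m = 99, a processor sends dx(98:99) to
    ! its right neighbor and receives dx(-2:-1) from its left neighbor, so
    ! each ghost cell width mirrors an interior cell of the adjacent
    ! subdomain.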
    !> The goal of this subroutine is to determine the global
        !! extrema of the stability criteria in the computational
        !! domain. This is performed by sifting through the local
        !! extrema of each stability criterion. Note that each of
        !! the local extrema is from a single process, within its
        !! assigned section of the computational domain. Finally,
        !! note that the global extrema values are only kept on
        !! the rank 0 processor.
        !! @param icfl_max_loc Local maximum ICFL stability criterion
        !! @param vcfl_max_loc Local maximum VCFL stability criterion
        !! @param ccfl_max_loc Local maximum CCFL stability criterion
        !! @param Rc_min_loc Local minimum Rc stability criterion
        !! @param icfl_max_glb Global maximum ICFL stability criterion
        !! @param vcfl_max_glb Global maximum VCFL stability criterion
        !! @param ccfl_max_glb Global maximum CCFL stability criterion
        !! @param Rc_min_glb Global minimum Rc stability criterion
    subroutine s_mpi_reduce_stability_criteria_extrema(icfl_max_loc, & ! --
                                                       vcfl_max_loc, &
                                                       ccfl_max_loc, &
                                                       Rc_min_loc, &
                                                       icfl_max_glb, &
                                                       vcfl_max_glb, &
                                                       ccfl_max_glb, &
                                                       Rc_min_glb)

        real(kind(0d0)), intent(IN) :: icfl_max_loc
        real(kind(0d0)), intent(IN) :: vcfl_max_loc
        real(kind(0d0)), intent(IN) :: ccfl_max_loc
        real(kind(0d0)), intent(IN) :: Rc_min_loc

        real(kind(0d0)), intent(OUT) :: icfl_max_glb
        real(kind(0d0)), intent(OUT) :: vcfl_max_glb
        real(kind(0d0)), intent(OUT) :: ccfl_max_glb
        real(kind(0d0)), intent(OUT) :: Rc_min_glb

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_reduce_stability_criteria_extrema not supported without MPI.'

#else

        ! Reducing local extrema of the ICFL, VCFL and Rc numbers to their
        ! global extrema and bookkeeping the results on the rank 0 processor
        call MPI_REDUCE(icfl_max_loc, icfl_max_glb, 1, &
                        MPI_DOUBLE_PRECISION, MPI_MAX, 0, &
                        MPI_COMM_WORLD, ierr)

        if (any(Re_size > 0)) then
            call MPI_REDUCE(vcfl_max_loc, vcfl_max_glb, 1, &
                            MPI_DOUBLE_PRECISION, MPI_MAX, 0, &
                            MPI_COMM_WORLD, ierr)
            call MPI_REDUCE(Rc_min_loc, Rc_min_glb, 1, &
                            MPI_DOUBLE_PRECISION, MPI_MIN, 0, &
                            MPI_COMM_WORLD, ierr)
        end if

#endif

    end subroutine s_mpi_reduce_stability_criteria_extrema ! ---------------

    !> The following subroutine takes the local value of a variable
        !! from all processors and reduces it to the sum of all the
        !! values. The reduced value is returned in var_glb on every
        !! processor in the communicator.
        !! @param var_loc Some variable containing the local value which should be
        !! reduced amongst all the processors in the communicator.
        !! @param var_glb The globally reduced value
    subroutine s_mpi_allreduce_sum(var_loc, var_glb) ! ---------------------

        real(kind(0d0)), intent(IN) :: var_loc
        real(kind(0d0)), intent(OUT) :: var_glb

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_allreduce_sum not supported without MPI.'

#else

        ! Performing the reduction procedure
        call MPI_ALLREDUCE(var_loc, var_glb, 1, MPI_DOUBLE_PRECISION, &
                           MPI_SUM, MPI_COMM_WORLD, ierr)

#endif

    end subroutine s_mpi_allreduce_sum ! -----------------------------------

    !> The following subroutine takes the local value of a variable
        !! from all processors and reduces it to the minimum of all the
        !! values. The reduced value is returned in var_glb on every
        !! processor in the communicator.
        !! @param var_loc Some variable containing the local value which should be
        !! reduced amongst all the processors in the communicator.
        !! @param var_glb The globally reduced value
    subroutine s_mpi_allreduce_min(var_loc, var_glb) ! ---------------------

        real(kind(0d0)), intent(IN) :: var_loc
        real(kind(0d0)), intent(OUT) :: var_glb

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_allreduce_min not supported without MPI.'

#else

        ! Performing the reduction procedure
        call MPI_ALLREDUCE(var_loc, var_glb, 1, MPI_DOUBLE_PRECISION, &
                           MPI_MIN, MPI_COMM_WORLD, ierr)

#endif

    end subroutine s_mpi_allreduce_min ! -----------------------------------
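    ! Example use of the reduction wrappers above (a sketch; vol_loc and
    ! vol_glb are hypothetical variables, not part of this module):
    !
    !     real(kind(0d0)) :: vol_loc, vol_glb
    !     vol_loc = ...  ! each processor's partial result
    !     call s_mpi_allreduce_sum(vol_loc, vol_glb)
    !     ! vol_glb now holds the global sum on every processor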
    !> The following subroutine takes the local value of a variable
        !! from all processors and reduces it to the maximum of all the
        !! values. The reduced value is returned in var_glb on every
        !! processor in the communicator.
        !! @param var_loc Some variable containing the local value which should be
        !! reduced amongst all the processors in the communicator.
        !! @param var_glb The globally reduced value
    subroutine s_mpi_allreduce_max(var_loc, var_glb) ! ---------------------

        real(kind(0d0)), intent(IN) :: var_loc
        real(kind(0d0)), intent(OUT) :: var_glb

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_allreduce_max not supported without MPI.'

#else

        ! Performing the reduction procedure
        call MPI_ALLREDUCE(var_loc, var_glb, 1, MPI_DOUBLE_PRECISION, &
                           MPI_MAX, MPI_COMM_WORLD, ierr)

#endif

    end subroutine s_mpi_allreduce_max ! -----------------------------------

    !> The goal of this procedure is to populate the buffers of
        !! the cell-average conservative variables by communicating
        !! with the neighboring processors.
        !! @param q_cons_vf Cell-average conservative variables
        !! @param mpi_dir MPI communication coordinate direction
        !! @param pbc_loc Processor boundary condition (PBC) location
    subroutine s_mpi_sendrecv_conservative_variables_buffers(q_cons_vf, &
                                                             mpi_dir, &
                                                             pbc_loc)

        type(scalar_field), dimension(sys_size), intent(INOUT) :: q_cons_vf

        integer, intent(IN) :: mpi_dir
        integer, intent(IN) :: pbc_loc

        integer :: i, j, k, l, r !< Generic loop iterators

#ifndef MFC_MPI

        print '(A)', '[m_mpi_proxy] s_mpi_sendrecv_conservative_variables_buffers not supported without MPI.'

#else

        nCalls_time = nCalls_time + 1

        ! MPI Communication in x-direction =================================
        if (mpi_dir == 1) then

            if (pbc_loc == -1) then      ! PBC at the beginning

                if (bc_x%end >= 0) then      ! PBC at the beginning and end

                    ! Packing buffer to be sent to bc_x%end
!$acc parallel loop collapse(4) gang vector default(present) private(r)
                    do l = 0, p
                        do k = 0, n
                            do j = m - buff_size + 1, m
                                do i = 1, sys_size
                                    r = (i - 1) + sys_size* &
                                        ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l))
                                    q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l)
                                end do
                            end do
                        end do
                    end do

                    call MPI_Barrier(MPI_COMM_WORLD, ierr)

#if defined(_OPENACC) && defined(__PGI)
                    if (cu_mpi) then
!$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send)

                        ! Send/receive buffer to/from bc_x%end/bc_x%beg
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

!$acc end host_data
!$acc wait
                    else
#endif
                        call cpu_time(s_time)
!$acc update host(q_cons_buff_send)
                        call cpu_time(e_time)
                        compress_time = compress_time + (e_time - s_time)

                        call cpu_time(s_time)
                        ! Send/receive buffer to/from bc_x%end/bc_x%beg
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%beg, 0, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
                        call cpu_time(e_time)
                        mpi_time = mpi_time + (e_time - s_time)
#if defined(_OPENACC) && defined(__PGI)
                    end if
#endif

                else                         ! PBC at the beginning only

                    ! Packing buffer to be sent to bc_x%beg
!$acc parallel loop collapse(4) gang vector default(present) private(r)
                    do l = 0, p
                        do k = 0, n
                            do j = 0, buff_size - 1
                                do i = 1, sys_size
                                    r = (i - 1) + sys_size* &
                                        (j + buff_size*(k + (n + 1)*l))
                                    q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l)
                                end do
                            end do
                        end do
                    end do

                    call MPI_Barrier(MPI_COMM_WORLD, ierr)

#if defined(_OPENACC) && defined(__PGI)
                    if (cu_mpi) then
!$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send)

                        !
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif end if #if defined(_OPENACC) && defined(__PGI) if(cu_mpi .eqv. .false.) then call cpu_time(s_time) !$acc update device(q_cons_buff_recv) call cpu_time(e_time) decompress_time = decompress_time + (e_time - s_time) end if #endif ! Unpacking buffer received from bc_x%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do l = 0, p do k = 0, n do j = -buff_size, -1 do i = 1, sys_size r = (i - 1) + sys_size* & (j + buff_size*((k + 1) + (n + 1)*l)) q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) end do end do end do end do else ! PBC at the end if (bc_x%beg >= 0) then ! PBC at the end and beginning !$acc parallel loop collapse(4) gang vector default(present) private(r) ! Packing buffer to be sent to bc_x%beg do l = 0, p do k = 0, n do j = 0, buff_size - 1 do i = 1, sys_size r = (i - 1) + sys_size* & (j + buff_size*(k + (n + 1)*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(n + 1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_x%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif else ! PBC at the end only ! Packing buffer to be sent to bc_x%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do l = 0, p do k = 0, n do j = m - buff_size + 1, m do i = 1, sys_size r = (i - 1) + sys_size* & ((j - m - 1) + buff_size*((k + 1) + (n + 1)*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! 
                        ! Send/receive buffer to/from bc_x%end/bc_x%end
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 1, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

!$acc end host_data
!$acc wait
                    else
#endif
                        call cpu_time(s_time)
!$acc update host(q_cons_buff_send)
                        call cpu_time(e_time)
                        compress_time = compress_time + (e_time - s_time)

                        call cpu_time(s_time)
                        ! Send/receive buffer to/from bc_x%end/bc_x%end
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(n + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_x%end, 1, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
                        call cpu_time(e_time)
                        mpi_time = mpi_time + (e_time - s_time)
#if defined(_OPENACC) && defined(__PGI)
                    end if
#endif

                end if

#if defined(_OPENACC) && defined(__PGI)
                if (cu_mpi .eqv. .false.) then
                    call cpu_time(s_time)
!$acc update device(q_cons_buff_recv)
                    call cpu_time(e_time)
                    decompress_time = decompress_time + (e_time - s_time)
                end if
#endif

                ! Unpacking buffer received from bc_x%end
!$acc parallel loop collapse(4) gang vector default(present) private(r)
                do l = 0, p
                    do k = 0, n
                        do j = m + 1, m + buff_size
                            do i = 1, sys_size
                                r = (i - 1) + sys_size* &
                                    ((j - m - 1) + buff_size*(k + (n + 1)*l))
                                q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r)
                            end do
                        end do
                    end do
                end do

            end if
        ! END: MPI Communication in x-direction ============================

        ! MPI Communication in y-direction =================================
        elseif (mpi_dir == 2) then

            if (pbc_loc == -1) then      ! PBC at the beginning

                if (bc_y%end >= 0) then      ! PBC at the beginning and end

                    ! Packing buffer to be sent to bc_y%end
!$acc parallel loop collapse(4) gang vector default(present) private(r)
                    do i = 1, sys_size
                        do l = 0, p
                            do k = n - buff_size + 1, n
                                do j = -buff_size, m + buff_size
                                    r = (i - 1) + sys_size* &
                                        ((j + buff_size) + (m + 2*buff_size + 1)* &
                                         ((k - n + buff_size - 1) + buff_size*l))
                                    q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l)
                                end do
                            end do
                        end do
                    end do

                    call MPI_Barrier(MPI_COMM_WORLD, ierr)

#if defined(_OPENACC) && defined(__PGI)
                    if (cu_mpi) then
!$acc host_data use_device(q_cons_buff_recv, q_cons_buff_send)

                        ! Send/receive buffer to/from bc_y%end/bc_y%beg
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_y%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

!$acc end host_data
!$acc wait
                    else
#endif
                        call cpu_time(s_time)
!$acc update host(q_cons_buff_send)
                        call cpu_time(e_time)
                        compress_time = compress_time + (e_time - s_time)

                        call cpu_time(s_time)
                        ! Send/receive buffer to/from bc_y%end/bc_y%beg
                        call MPI_SENDRECV( &
                            q_cons_buff_send(0), &
                            buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_y%end, 0, &
                            q_cons_buff_recv(0), &
                            buff_size*sys_size*(m + 2*buff_size + 1)*(p + 1), &
                            MPI_DOUBLE_PRECISION, bc_y%beg, 0, &
                            MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
                        call cpu_time(e_time)
                        mpi_time = mpi_time + (e_time - s_time)
#if defined(_OPENACC) && defined(__PGI)
                    end if
#endif

                else                         ! PBC at the beginning only

                    !
Packing buffer to be sent to bc_y%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, p do k = 0, buff_size - 1 do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & (k + buff_size*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif end if #if defined(_OPENACC) && defined(__PGI) if(cu_mpi .eqv. .false.) then call cpu_time(s_time) !$acc update device(q_cons_buff_recv) call cpu_time(e_time) decompress_time = decompress_time + (e_time - s_time) end if #endif ! Unpacking buffer received from bc_y%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, p do k = -buff_size, -1 do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + buff_size*l)) q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) end do end do end do end do else ! PBC at the end if (bc_y%beg >= 0) then ! PBC at the end and beginning ! Packing buffer to be sent to bc_y%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, p do k = 0, buff_size - 1 do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & (k + buff_size*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif else ! PBC at the end only ! Packing buffer to be sent to bc_y%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, p do k = n - buff_size + 1, n do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k - n + buff_size - 1) + buff_size*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(p + 1), & MPI_DOUBLE_PRECISION, bc_y%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif end if #if defined(_OPENACC) && defined(__PGI) if(cu_mpi .eqv. .false.) then call cpu_time(s_time) !$acc update device(q_cons_buff_recv) call cpu_time(e_time) decompress_time = decompress_time + (e_time - s_time) end if #endif ! Unpacking buffer received form bc_y%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, p do k = n + 1, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k - n - 1) + buff_size*l)) q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) end do end do end do end do end if ! END: MPI Communication in y-direction ============================ ! MPI Communication in z-direction ================================= else if (pbc_loc == -1) then ! PBC at the beginning if (bc_z%end >= 0) then ! PBC at the beginning and end ! Packing buffer to be sent to bc_z%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = p - buff_size + 1, p do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & (l - p + buff_size - 1))) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! 
Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif else ! PBC at the beginning only ! Packing buffer to be sent to bc_z%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, buff_size - 1 do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 1, & q_cons_buff_recv(0),& buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 0, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif end if #if defined(_OPENACC) && defined(__PGI) if(cu_mpi .eqv. .false.) then call cpu_time(s_time) !$acc update device(q_cons_buff_recv) call cpu_time(e_time) decompress_time = decompress_time + (e_time - s_time) end if #endif ! Unpacking buffer from bc_z%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = -buff_size, -1 do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & (l + buff_size))) q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) end do end do end do end do else ! PBC at the end if (bc_z%beg >= 0) then ! PBC at the end and beginning ! 
Packing buffer to be sent to bc_z%beg !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = 0, buff_size - 1 do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)*l)) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%beg, 1, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif else ! PBC at the end only ! Packing buffer to be sent to bc_z%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = p - buff_size + 1, p do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & (l - p + buff_size - 1))) q_cons_buff_send(r) = q_cons_vf(i)%sf(j, k, l) end do end do end do end do call MPI_Barrier(MPI_COMM_WORLD, ierr) #if defined(_OPENACC) && defined(__PGI) if(cu_mpi) then !$acc host_data use_device( q_cons_buff_recv, q_cons_buff_send ) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m+2*buff_size+1)*(n+2*buff_size+1), & MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) !$acc end host_data !$acc wait else #endif call cpu_time(s_time) !$acc update host(q_cons_buff_send) call cpu_time(e_time) compress_time = compress_time + (e_time - s_time) call cpu_time(s_time) ! Send/receive buffer to/from bc_x%end/bc_x%beg call MPI_SENDRECV( & q_cons_buff_send(0), & buff_size*sys_size*(m + 2*buff_size + 1)*(n + 2*buff_size+ 1), & MPI_DOUBLE_PRECISION, bc_z%end, 0, & q_cons_buff_recv(0), & buff_size*sys_size*(m + 2*buff_size + 1)*(n +2*buff_size + 1), & MPI_DOUBLE_PRECISION, bc_z%end, 1, & MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr) call cpu_time(e_time) mpi_time = mpi_time + (e_time - s_time) #if defined(_OPENACC) && defined(__PGI) end if #endif end if #if defined(_OPENACC) && defined(__PGI) if(cu_mpi .eqv. .false.) then call cpu_time(s_time) !$acc update device(q_cons_buff_recv) call cpu_time(e_time) decompress_time = decompress_time + (e_time - s_time) end if #endif ! 
Unpacking buffer received from bc_z%end !$acc parallel loop collapse(4) gang vector default(present) private(r) do i = 1, sys_size do l = p + 1, p + buff_size do k = -buff_size, n + buff_size do j = -buff_size, m + buff_size r = (i - 1) + sys_size* & ((j + buff_size) + (m + 2*buff_size + 1)* & ((k + buff_size) + (n + 2*buff_size + 1)* & (l - p - 1))) q_cons_vf(i)%sf(j, k, l) = q_cons_buff_recv(r) end do end do end do end do end if end if ! END: MPI Communication in z-direction ============================ #endif end subroutine s_mpi_sendrecv_conservative_variables_buffers ! --------- !> Module deallocation and/or disassociation procedures subroutine s_finalize_mpi_proxy_module() ! ----------------------------- #ifndef MFC_MPI print '(A)', '[m_mpi_proxy] s_finalize_mpi_proxy_module not supported without MPI.' #else ! Deallocating q_cons_buff_send and q_cons_buff_recv deallocate (q_cons_buff_send, q_cons_buff_recv) #endif end subroutine s_finalize_mpi_proxy_module ! --------------------------- !> The subroutine finalizes the MPI execution environment. subroutine s_mpi_finalize() ! ------------------------------------------ #ifndef MFC_MPI print '(A)', '[m_mpi_proxy] s_mpi_finalize not supported without MPI.' #else ! Finalizing the MPI environment call MPI_FINALIZE(ierr) #endif end subroutine s_mpi_finalize ! ---------------------------------------- end module m_mpi_proxy
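! A minimal driver sketch for this module (an assumption about the intended
! call order inferred from the roles of the routines above, not prescribed by
! this file; the globals in m_global_parameters must be set up by the host
! code):
!
!     program p_example
!         use m_mpi_proxy
!         call s_mpi_initialize()
!         call s_mpi_bcast_user_inputs()
!         call s_mpi_decompose_computational_domain()
!         call s_initialize_mpi_proxy_module()
!         ! ... time marching, calling s_mpi_sendrecv_*_buffers each step ...
!         call s_finalize_mpi_proxy_module()
!         call s_mpi_finalize()
!     end program p_example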