Configuring and computing a DFT in DPC++

Contents

Configuring and computing a DFT in DPC++#

Some usage examples are provided below for computing (possibly batched) DFTs using device-accessible USM allocations, with default or otherwise recommended strides. More working examples can also be found in the oneMKL installation directory under share/doc/mkl/examples/sycl/dft.

To use the DPC++ interface, include oneapi/mkl/dft.hpp. The DPC++ interface supports CPU and Intel GPU devices with some limitations, documented in previous pages. Refer to the Intel® oneAPI Math Kernel Library Release Notes for other (release-specific) known limitations.

The DFT functions assume the Cartesian representation of complex data; that is, complex numbers are defined by their real and imaginary parts. oneMKL provides efficient Vector Mathematical Functions for conversion to and from polar representation.

Usage examples#

The following example illustrates the computation of (possibly many) complex and real discrete Fourier transforms using device-accessible USM allocations without dependencies on other events, via the DPC++ interface of oneMKL.

#include <oneapi/mkl/dft.hpp>
#include <complex>
enum class dft_compute_direction {
   FWD_DFT,
   BWD_DFT
};
namespace dft_ns = oneapi::mkl::dft;

template<typename T>
bool is_usable_compute_type = std::is_same_v<T, float> |
                              std::is_same_v<T, std::complex<float>> |
                              std::is_same_v<T, double> |
                              std::is_same_v<T, std::complex<double>>;

template<dft_ns::precision prec, dft_ns::domain dom>
void ready_descriptor(dft_ns::descriptor<prec, dom>& desc,
                      sycl::queue& q,
                      dft_ns::config_value placement,
                      std::int64_t batch_size) {
   // this example assumes that desc was freshly created and that its default
   // configuration values have not been changed prior to invoking this routine

   // Fetch rank, lengths and type of forward domain
   std::int64_t rank;
   desc.get_value(dft_ns::config_param::DIMENSION, &rank);
   std::vector<std::int64_t> lengths(rank);
   desc.get_value(dft_ns::config_param::LENGTHS, &lengths);
   if (placement == dft_ns::config_value::NOT_INPLACE) {
      // non-default behavior
      desc.set_value(dft_ns::config_param::PLACEMENT, placement);
      if constexpr (dom == dft_ns::domain::REAL) {
         // for real descriptors interpreting backward domain's data type as
         // complex (default behavior), unpadded forward-domain data layouts
         // may be used if operating out-of-place:
         std::vector<std::int64_t> fwd_strides(rank + 1);
         std::vector<std::int64_t> bwd_strides(rank + 1);
         fwd_strides[rank] = bwd_strides[rank] = 1;
         if (rank > 1) {
            fwd_strides[rank - 1] = lengths[rank - 1];
            bwd_strides[rank - 1] = lengths[rank - 1] / 2 + 1;
            for (std::int64_t dim = 2; dim < rank; dim++) {
               fwd_strides[rank - dim] = fwd_strides[rank - dim + 1] * lengths[rank - dim];
               bwd_strides[rank - dim] = bwd_strides[rank - dim + 1] * lengths[rank - dim];
            }
         }
         fwd_strides[0] = bwd_strides[0] = 0; // offsets in data layouts
         desc.set_value(dft_ns::config_param::FWD_STRIDES, fwd_strides);
         desc.set_value(dft_ns::config_param::BWD_STRIDES, bwd_strides);
      }
   }
   if (batch_size > 1) {
      // configuration of batched DFTs
      desc.set_value(dft_ns::config_param::NUMBER_OF_TRANSFORMS, batch_size);
      std::int64_t fwd_distance, bwd_distance;
      fwd_distance = bwd_distance = 1;
      for (std::int64_t dim = 0; dim < rank - 1; dim++) {
         fwd_distance *= lengths[dim];
         bwd_distance *= lengths[dim];
      }
      if constexpr (dom == dft_ns::domain::REAL) {
         if (placement == dft_ns::config_value::NOT_INPLACE)
            fwd_distance *= lengths[rank - 1];
         else
            fwd_distance *= 2 * (lengths[rank - 1] / 2 + 1);
         bwd_distance *= (lengths[rank - 1] / 2 + 1);
      }
      else {
         fwd_distance *= lengths[rank - 1];
         bwd_distance *= lengths[rank - 1];
      }
      desc.set_value(dft_ns::config_param::FWD_DISTANCE, fwd_distance);
      desc.set_value(dft_ns::config_param::BWD_DISTANCE, bwd_distance);
   }
   desc.commit(q);
}

template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_cmplx_dft(sycl::queue& q,
                                      const std::vector<std::int64_t>& lengths,
                                      const std::int64_t& batch_size,
                                      const dft_compute_direction& direction,
                                      T* device_accessible_usm_data) {
   constexpr auto prec =
      std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>> ?
         dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
   auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
   ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
   // Initialize data such that input data entry of multi-index
   // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
   //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
   //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
   // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
   //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
   // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
   //   is Z[ m*lengths[0] + k1 ]
   // wherein
   // using real_t =
   //   std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
   // std::complex<real_t>* Z =
   //    static_cast<std::complex<real_t>*>(device_accessible_usm_data);
   // (0 <= kj < lengths[j-1])
   if (direction == dft_compute_direction::FWD_DFT)
      return dft_ns::compute_forward(desc, device_accessible_usm_data);
   else
      return dft_ns::compute_backward(desc, device_accessible_usm_data);
   // Upon completion of any of the above, the output data entry of multi-index
   // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
   //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
   //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
   // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
   //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
   // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
   //   is Z[ m*lengths[0] + k1 ]
   // (0 <= kj < lengths[j-1])
}

template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_cmplx_dft(sycl::queue& q,
                                         const std::vector<std::int64_t>& lengths,
                                         const std::int64_t& batch_size,
                                         const dft_compute_direction& direction,
                                         T* device_accessible_usm_data_in,
                                         T* device_accessible_usm_data_out) {
   constexpr bool is_single_precision =
      std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
   constexpr auto prec = (is_single_precision) ?
      dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
   auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
   ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
   // Initialize data such that input data entry of multi-index
   // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
   //   is Z[ m*lengths[0]*lengths[1]*lengths[2]
   //         + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
   // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
   //   is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
   // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
   //   is Z[ m*lengths[0] + k1 ]
   // wherein
   // using real_t =
   //   std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
   // std::complex<real_t>* Z =
   //    static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
   // (0 <= kj < lengths[j-1])
   if (direction == dft_compute_direction::FWD_DFT)
      return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
                                           device_accessible_usm_data_out);
   else
      return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
                                            device_accessible_usm_data_out);
   // Upon completion of any of the above, the output data entry of multi-index
   // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
   //   = Y[ m*lengths[0]*lengths[1]*lengths[2]
   //        + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
   // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
   //   = Y[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
   // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
   //   = Y[ m*lengths[0] + k1 ]
   // wherein
   // using real_t =
   //   std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
   // std::complex<real_t>* Y =
   //    static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
   // (0 <= kj < lengths[j-1])
}

template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_real_dft(sycl::queue& q,
                                     const std::vector<std::int64_t>& lengths,
                                     const std::int64_t& batch_size,
                                     const dft_compute_direction& direction,
                                     T* device_accessible_usm_data) {
   constexpr bool is_single_precision =
      std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
   constexpr auto prec = (is_single_precision) ?
      dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
   auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
   ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
   // using real_t =
   //   std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
   // Let
   // real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data);
   // and
   // std::complex<real_t>* bwd_Z =
   //    static_cast<std::complex<real_t>*>(device_accessible_usm_data);
   if (direction == dft_compute_direction::FWD_DFT) {
      // Initialize data such that input data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*2*(lengths[2]/2 + 1)
      //             + k2*2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
      //             + k1*2*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1])
      return dft_ns::compute_forward(desc, device_accessible_usm_data);
      // Upon completion of the above, the output data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*(lengths[2]/2 + 1)
      //             + k2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
      //             + k1*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1]       for j <  lengths.size() - 1 and
      //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
   }
   else {
      // Initialize data such that input data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*(lengths[2]/2 + 1)
      //             + k2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
      //             + k1*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1]       for j <  lengths.size() - 1 and
      //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
      return dft_ns::compute_backward(desc, device_accessible_usm_data);
      // Upon completion of the above, the output data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*2*(lengths[2]/2 + 1)
      //             + k2*2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
      //             + k1*2*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1])
   }
}

template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_real_dft(sycl::queue& q,
                                        const std::vector<std::int64_t>& lengths,
                                        const std::int64_t& batch_size,
                                        const dft_compute_direction& direction,
                                        T* device_accessible_usm_data_in,
                                        T* device_accessible_usm_data_out) {
   constexpr bool is_single_precision =
      std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
   constexpr auto prec = (is_single_precision) ?
      dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
   auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
   ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
   if (direction == dft_compute_direction::FWD_DFT) {
      // Initialize data such that input data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
      //             + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is fwd_Z[ m*lengths[0] + k1 ]
      // wherein
      // using real_t =
      //    std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
      // real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_in);
      // (0 <= kj < lengths[j-1])
      return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
                                           device_accessible_usm_data_out);
      // Upon completion of the above, the output data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*(lengths[2]/2 + 1)
      //             + k2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
      //             + k1*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1]       for j <  lengths.size() - 1 and
      //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
      // wherein
      // using real_t =
      //    std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
      // std::complex<real_t>* bwd_Z =
      //    static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
   }
   else {
      // Initialize data such that input data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
      //             + k1*lengths[1]*(lengths[2]/2 + 1)
      //             + k2*(lengths[2]/2 + 1) + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
      //             + k1*(lengths[1]/2 + 1) + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
      // (0 <= kj < lengths[j-1]       for j <  lengths.size() - 1 and
      //  0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
      // wherein
      // using real_t =
      //    std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
      // std::complex<real_t>* bwd_Z =
      //    static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
      return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
                                            device_accessible_usm_data_out);
      // Upon completion of the above, the output data entry of multi-index
      // - {k1,k2,k3; m}   (for 3D DFTs, i.e., lengths.size() == 3)
      //   is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
      //             + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
      // - {k1,k2; m}      (for 2D DFTs, i.e., lengths.size() == 2)
      //   is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
      // - {k1; m}         (for 1D DFTs, i.e., lengths.size() == 1)
      //   is fwd_Z[ m*lengths[0] + k1 ]
      // (0 <= kj < lengths[j-1])
      // wherein
      // using real_t =
      //    std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
      // real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_out);
   }
}