Configuring and computing a DFT in DPC++#
Some usage examples are provided below for computing (possibly batched) DFTs using device-accessible USM allocations, with default or otherwise recommended strides. More working examples can also be found in the oneMKL installation directory under share/doc/mkl/examples/sycl/dft
.
To use the DPC++ interface, include oneapi/mkl/dft.hpp
. The DPC++ interface supports CPU and Intel GPU devices with some limitations, documented in previous pages. Refer to the Intel® oneAPI Math Kernel Library Release Notes for other (release-specific) known limitations.
The DFT functions assume the Cartesian representation of complex data; that is, complex numbers are defined by their real and imaginary parts. oneMKL provides efficient Vector Mathematical Functions for conversion to and from polar representation.
Usage examples#
The following example illustrates the computation of (possibly many) complex and real discrete Fourier transforms using device-accessible USM allocations without dependencies on other events, via the DPC++ interface of oneMKL.
#include <oneapi/mkl/dft.hpp>
#include <complex>
enum class dft_compute_direction {
FWD_DFT,
BWD_DFT
};
namespace dft_ns = oneapi::mkl::dft;
template<typename T>
bool is_usable_compute_type = std::is_same_v<T, float> |
std::is_same_v<T, std::complex<float>> |
std::is_same_v<T, double> |
std::is_same_v<T, std::complex<double>>;
template<dft_ns::precision prec, dft_ns::domain dom>
void ready_descriptor(dft_ns::descriptor<prec, dom>& desc,
sycl::queue& q,
dft_ns::config_value placement,
std::int64_t batch_size) {
// this example assumes that desc was freshly created and that its default
// configuration values have not been changed prior to invoking this routine
// Fetch rank, lengths and type of forward domain
std::int64_t rank;
desc.get_value(dft_ns::config_param::DIMENSION, &rank);
std::vector<std::int64_t> lengths(rank);
desc.get_value(dft_ns::config_param::LENGTHS, &lengths);
if (placement == dft_ns::config_value::NOT_INPLACE) {
// non-default behavior
desc.set_value(dft_ns::config_param::PLACEMENT, placement);
if constexpr (dom == dft_ns::domain::REAL) {
// for real descriptors interpreting backward domain's data type as
// complex (default behavior), unpadded forward-domain data layouts
// may be used if operating out-of-place:
std::vector<std::int64_t> fwd_strides(rank + 1);
std::vector<std::int64_t> bwd_strides(rank + 1);
fwd_strides[rank] = bwd_strides[rank] = 1;
if (rank > 1) {
fwd_strides[rank - 1] = lengths[rank - 1];
bwd_strides[rank - 1] = lengths[rank - 1] / 2 + 1;
for (std::int64_t dim = 2; dim < rank; dim++) {
fwd_strides[rank - dim] = fwd_strides[rank - dim + 1] * lengths[rank - dim];
bwd_strides[rank - dim] = bwd_strides[rank - dim + 1] * lengths[rank - dim];
}
}
fwd_strides[0] = bwd_strides[0] = 0; // offsets in data layouts
desc.set_value(dft_ns::config_param::FWD_STRIDES, fwd_strides);
desc.set_value(dft_ns::config_param::BWD_STRIDES, bwd_strides);
}
}
if (batch_size > 1) {
// configuration of batched DFTs
desc.set_value(dft_ns::config_param::NUMBER_OF_TRANSFORMS, batch_size);
std::int64_t fwd_distance, bwd_distance;
fwd_distance = bwd_distance = 1;
for (std::int64_t dim = 0; dim < rank - 1; dim++) {
fwd_distance *= lengths[dim];
bwd_distance *= lengths[dim];
}
if constexpr (dom == dft_ns::domain::REAL) {
if (placement == dft_ns::config_value::NOT_INPLACE)
fwd_distance *= lengths[rank - 1];
else
fwd_distance *= 2 * (lengths[rank - 1] / 2 + 1);
bwd_distance *= (lengths[rank - 1] / 2 + 1);
}
else {
fwd_distance *= lengths[rank - 1];
bwd_distance *= lengths[rank - 1];
}
desc.set_value(dft_ns::config_param::FWD_DISTANCE, fwd_distance);
desc.set_value(dft_ns::config_param::BWD_DISTANCE, bwd_distance);
}
desc.commit(q);
}
template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_cmplx_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data) {
constexpr auto prec =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>> ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data);
// (0 <= kj < lengths[j-1])
if (direction == dft_compute_direction::FWD_DFT)
return dft_ns::compute_forward(desc, device_accessible_usm_data);
else
return dft_ns::compute_backward(desc, device_accessible_usm_data);
// Upon completion of any of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// (0 <= kj < lengths[j-1])
}
template <typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_cmplx_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data_in,
T* device_accessible_usm_data_out) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::COMPLEX>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
// (0 <= kj < lengths[j-1])
if (direction == dft_compute_direction::FWD_DFT)
return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
else
return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of any of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// = Y[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// = Y[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// = Y[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* Y =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
// (0 <= kj < lengths[j-1])
}
template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_inplace_real_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::INPLACE, batch_size);
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// Let
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data);
// and
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data);
if (direction == dft_compute_direction::FWD_DFT) {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
// + k1*lengths[1]*2*(lengths[2]/2 + 1)
// + k2*2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
// + k1*2*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1])
return dft_ns::compute_forward(desc, device_accessible_usm_data);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
}
else {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
return dft_ns::compute_backward(desc, device_accessible_usm_data);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*2*(lengths[2]/2 + 1)
// + k1*lengths[1]*2*(lengths[2]/2 + 1)
// + k2*2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*2*(lengths[1]/2 + 1)
// + k1*2*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*2*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1])
}
}
template<typename T, std::enable_if_t<is_usable_compute_type<T>, bool> = true>
sycl::event compute_outofplace_real_dft(sycl::queue& q,
const std::vector<std::int64_t>& lengths,
const std::int64_t& batch_size,
const dft_compute_direction& direction,
T* device_accessible_usm_data_in,
T* device_accessible_usm_data_out) {
constexpr bool is_single_precision =
std::is_same_v<T, float> | std::is_same_v<T, std::complex<float>>;
constexpr auto prec = (is_single_precision) ?
dft_ns::precision::SINGLE : dft_ns::precision::DOUBLE;
auto desc = dft_ns::descriptor<prec, dft_ns::domain::REAL>(lengths);
ready_descriptor(desc, q, dft_ns::config_value::NOT_INPLACE, batch_size);
if (direction == dft_compute_direction::FWD_DFT) {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*lengths[0] + k1 ]
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_in);
// (0 <= kj < lengths[j-1])
return dft_ns::compute_forward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_out);
}
else {
// Initialize data such that input data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is bwd_Z[ m*lengths[0]*lengths[1]*(lengths[2]/2 + 1)
// + k1*lengths[1]*(lengths[2]/2 + 1)
// + k2*(lengths[2]/2 + 1) + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is bwd_Z[ m*lengths[0]*(lengths[1]/2 + 1)
// + k1*(lengths[1]/2 + 1) + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is bwd_Z[ m*(lengths[0]/2 + 1) + k1 ]
// (0 <= kj < lengths[j-1] for j < lengths.size() - 1 and
// 0 <= kj < lengths[j-1]/2 + 1 for j == lengths.size() - 1)
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// std::complex<real_t>* bwd_Z =
// static_cast<std::complex<real_t>*>(device_accessible_usm_data_in);
return dft_ns::compute_backward(desc, device_accessible_usm_data_in,
device_accessible_usm_data_out);
// Upon completion of the above, the output data entry of multi-index
// - {k1,k2,k3; m} (for 3D DFTs, i.e., lengths.size() == 3)
// is fwd_Z[ m*lengths[0]*lengths[1]*lengths[2]
// + k1*lengths[1]*lengths[2] + k2*lengths[2] + k3 ]
// - {k1,k2; m} (for 2D DFTs, i.e., lengths.size() == 2)
// is fwd_Z[ m*lengths[0]*lengths[1] + k1*lengths[1] + k2 ]
// - {k1; m} (for 1D DFTs, i.e., lengths.size() == 1)
// is fwd_Z[ m*lengths[0] + k1 ]
// (0 <= kj < lengths[j-1])
// wherein
// using real_t =
// std::conditional_t<prec == dft_ns::precision::SINGLE, float, double>;
// real_t* fwd_Z = static_cast<real_t*>(device_accessible_usm_data_out);
}
}