Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ if(USE_CUDA)
source_base/kernels/cuda/math_kernel_op.cu
source_base/kernels/cuda/math_kernel_op_vec.cu
source_hamilt/module_xc/kernels/cuda/xc_functional_op.cu
source_pw/module_ofdft/kedf_wt_gpu.cu
source_pw/module_pwdft/kernels/cuda/cal_density_real_op.cu
source_pw/module_pwdft/kernels/cuda/mul_potential_op.cu
source_pw/module_pwdft/kernels/cuda/vec_mul_vec_complex.cu
Expand Down
7 changes: 7 additions & 0 deletions source/source_pw/module_ofdft/kedf_wt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,13 @@ double KEDF_WT::diff_linhard(double eta, double vw_weight)
*/
void KEDF_WT::multi_kernel(const double* const* prho, double** rkernel_rho, double exponent, ModulePW::PW_Basis* pw_rho)
{
#ifdef __CUDA
if (pw_rho->get_device() == "gpu") {
this->multi_kernel_gpu(prho, rkernel_rho, exponent, pw_rho);
return;
}
#endif

std::complex<double>** recipkernelRho = new std::complex<double>*[PARAM.inp.nspin];
for (int is = 0; is < PARAM.inp.nspin; ++is)
{
Expand Down
25 changes: 24 additions & 1 deletion source/source_pw/module_ofdft/kedf_wt.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,17 @@
#define KEDF_WT_H
#include <cmath>
#include <cstdio>
#include <complex>

#include "source_base/global_function.h"
#include "source_base/matrix.h"
#include "source_base/timer.h"
#include "source_basis/module_pw/pw_basis.h"

#ifdef __CUDA
#include <cufft.h>
#endif

/**
* @brief A class which calculates the kinetic energy, potential, and stress with Wang-Teter (WT) KEDF.
* See Wang L W, Teter M P. Physical Review B, 1992, 45(23): 13196.
Expand All @@ -22,6 +27,9 @@ class KEDF_WT
}
~KEDF_WT()
{
#ifdef __CUDA
this->free_gpu_buffers();
#endif
delete[] this->kernel_;
}

Expand Down Expand Up @@ -65,5 +73,20 @@ class KEDF_WT
* 2; // 10/3*(3*pi^2)^{2/3}, multiply by 2 to convert unit from Hartree to Ry, finally in Ry*Bohr^(-2)
double wt_coef_ = 0.; // coefficient of WT kernel
double* kernel_ = nullptr;

#ifdef __CUDA
void multi_kernel_gpu(const double* const* prho, double** rkernel_rho,
double exponent, ModulePW::PW_Basis* pw_rho);
void free_gpu_buffers();

// Persistent GPU buffers (lazily allocated once, reused across SCF iterations)
double* d_rho_ = nullptr; // real-space input (nrxx doubles)
cufftHandle cufft_plan_fwd_ = 0; // cuFFT forward plan
cufftHandle cufft_plan_bwd_ = 0; // cuFFT backward plan
double* d_result_ = nullptr; // real-space output (nrxx doubles)
double* d_kernel_ = nullptr; // WT kernel on device (npw doubles)

bool gpu_allocated_ = false;
#endif
};
#endif
#endif
161 changes: 161 additions & 0 deletions source/source_pw/module_ofdft/kedf_wt_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/**
* @file kedf_wt_gpu.cu
* @brief GPU-accelerated WT KEDF multi_kernel convolution.
*
* Offloads the rho^exponent → FFT → kernel multiply → IFFT pipeline
* to GPU using cuFFT directly (bypassing PW_Basis template API to
* avoid cross-target template instantiation issues).
*
* Persistent GPU buffers are lazily allocated and reused across SCF.
*
* @author Wang Chenxi, Reze
* @date 2026-06
*/
#include "kedf_wt.h"
#include "source_base/module_device/device_check.h"
#include "source_base/module_device/memory_op.h"
#include "source_io/module_parameter/parameter.h"

#include <cuda_runtime.h>
#include <cufft.h>
#include <thrust/complex.h>

namespace {

constexpr int THREADS_PER_BLOCK = 256;

/// Element-wise multiply: complex array *= real kernel.
__global__ void kedf_wt_recip_multiply(
thrust::complex<double>* __restrict__ data,
const double* __restrict__ kernel,
int npw)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= npw) { return; }
data[idx] *= kernel[idx];
}

/// Real → complex conversion (imag = 0).
__global__ void kedf_wt_real_to_complex(
const double* __restrict__ src,
thrust::complex<double>* __restrict__ dst,
int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) { return; }
dst[idx] = thrust::complex<double>(src[idx], 0.0);
}

/// Complex → real with 1/N normalization (cuFFT inverse doesn't normalize).
__global__ void kedf_wt_complex_to_real_norm(
const thrust::complex<double>* __restrict__ src,
double* __restrict__ dst,
double inv_n,
int n)
{
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) { return; }
dst[idx] = src[idx].real() * inv_n;
}

/// cuFFT error check wrapper.
inline void cufft_check(cufftResult err, const char* file, int line)
{
if (err != CUFFT_SUCCESS) {
std::cerr << "cuFFT error " << (int)err
<< " at " << file << ":" << line << std::endl;
exit(1);
}
}
#define CUFFT_CHECK(call) cufft_check(call, __FILE__, __LINE__)

} // anonymous namespace

void KEDF_WT::multi_kernel_gpu(
const double* const* prho,
double** rkernel_rho,
double exponent,
ModulePW::PW_Basis* pw_rho)
{
const int nrxx = pw_rho->nrxx;
const int npw = pw_rho->npw;
const int nx = pw_rho->nx;
const int ny = pw_rho->ny;
const int nz = pw_rho->nz;
const double inv_nrxx = 1.0 / nrxx;

// ── Lazy allocation of persistent GPU buffers ──
if (!gpu_allocated_) {
resmem_dd_op()(d_rho_, nrxx);
resmem_dd_op()(d_result_, nrxx * 2); // reused as complex work buffer
resmem_dd_op()(d_kernel_, npw);

syncmem_d2d_h2d_op()(d_kernel_, this->kernel_, npw);

// Create cuFFT plans (3D Z2Z, in-place)
CUFFT_CHECK(cufftPlan3d(&cufft_plan_fwd_, nz, ny, nx, CUFFT_Z2Z));
CUFFT_CHECK(cufftPlan3d(&cufft_plan_bwd_, nz, ny, nx, CUFFT_Z2Z));

gpu_allocated_ = true;
}

const int blocks = (npw + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

// d_result_ is double* but reused as cuFFT complex buffer (nrxx*2 = nrxx complex doubles).
auto* d_fft = reinterpret_cast<cufftDoubleComplex*>(d_result_);

for (int is = 0; is < PARAM.inp.nspin; ++is) {
// Step 1: rho^exponent on CPU
for (int ir = 0; ir < nrxx; ++ir) {
rkernel_rho[is][ir] = std::pow(prho[is][ir], exponent);
}

// Step 2: H → D
syncmem_d2d_h2d_op()(d_rho_, rkernel_rho[is], nrxx);

// Step 3: Real → Complex on GPU
kedf_wt_real_to_complex<<<blocks, THREADS_PER_BLOCK>>>(
d_rho_,
reinterpret_cast<thrust::complex<double>*>(d_fft),
nrxx);
CHECK_CUDA_SYNC();

// Step 4: Forward FFT (in-place on d_fft)
CUFFT_CHECK(cufftExecZ2Z(cufft_plan_fwd_, d_fft, d_fft, CUFFT_FORWARD));

// Step 5: Multiply by WT kernel in G-space
kedf_wt_recip_multiply<<<blocks, THREADS_PER_BLOCK>>>(
reinterpret_cast<thrust::complex<double>*>(d_fft),
d_kernel_,
npw);
CHECK_CUDA_SYNC();

// Step 6: Inverse FFT (in-place on d_fft)
CUFFT_CHECK(cufftExecZ2Z(cufft_plan_bwd_, d_fft, d_fft, CUFFT_INVERSE));

// Step 7: Complex → Real with 1/N normalization
kedf_wt_complex_to_real_norm<<<blocks, THREADS_PER_BLOCK>>>(
reinterpret_cast<thrust::complex<double>*>(d_fft),
d_result_, // back to double* usage
inv_nrxx,
nrxx);
CHECK_CUDA_SYNC();

// Step 8: D → H
syncmem_d2d_d2h_op()(rkernel_rho[is], d_result_, nrxx);
}
}

void KEDF_WT::free_gpu_buffers()
{
if (!gpu_allocated_) { return; }

if (cufft_plan_fwd_ != 0) { cufftDestroy(cufft_plan_fwd_); cufft_plan_fwd_ = 0; }
if (cufft_plan_bwd_ != 0) { cufftDestroy(cufft_plan_bwd_); cufft_plan_bwd_ = 0; }

if (d_rho_ != nullptr) { delmem_dd_op()(d_rho_); d_rho_ = nullptr; }
if (d_result_ != nullptr) { delmem_dd_op()(d_result_); d_result_ = nullptr; }
if (d_kernel_ != nullptr) { delmem_dd_op()(d_kernel_); d_kernel_ = nullptr; }

gpu_allocated_ = false;
}
Loading