From 71a16be0b75d9b88eb5d646bb455404f2a969aa9 Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Sat, 30 May 2026 14:26:07 +0800 Subject: [PATCH 1/4] fix:add OpenMP parallelization to BPCG CPU kernel band loops --- .../source_hsolver/kernels/bpcg_kernel_op.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 19c51fc398b..53d548a04ec 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -18,12 +18,18 @@ struct line_minimize_with_block_op const int& n_basis_max, const int& n_band) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (int band_idx = 0; band_idx < n_band; band_idx++) { Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); +#ifdef _OPENMP +#pragma omp critical(reduce_norm) +#endif Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) @@ -35,8 +41,17 @@ struct line_minimize_with_block_op epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } +#ifdef _OPENMP +#pragma omp critical(reduce_epsilo) +#endif Parallel_Reduce::reduce_pool(epsilo_0); +#ifdef _OPENMP +#pragma omp critical(reduce_epsilo) +#endif Parallel_Reduce::reduce_pool(epsilo_1); +#ifdef _OPENMP +#pragma omp critical(reduce_epsilo) +#endif Parallel_Reduce::reduce_pool(epsilo_2); theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); cos_theta = std::cos(theta); @@ -66,6 +81,9 @@ struct calc_grad_with_block_op const int& n_basis_max, const int& n_band) { +#ifdef _OPENMP +#pragma omp parallel for schedule(static) +#endif for (int band_idx = 0; band_idx < n_band; band_idx++) { Real err = 0.0; @@ -75,6 +93,9 @@ struct calc_grad_with_block_op T grad_1 = {0.0, 0.0}; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); +#ifdef _OPENMP +#pragma omp critical(reduce_norm) +#endif Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) @@ -84,6 +105,9 @@ struct calc_grad_with_block_op hpsi_out[item] *= norm; epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } +#ifdef _OPENMP +#pragma omp critical(reduce_epsilo) +#endif Parallel_Reduce::reduce_pool(epsilo); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { @@ -93,7 +117,13 @@ struct calc_grad_with_block_op err += grad_2; beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } +#ifdef _OPENMP +#pragma omp critical(reduce_err) +#endif Parallel_Reduce::reduce_pool(err); +#ifdef _OPENMP +#pragma omp critical(reduce_beta) +#endif Parallel_Reduce::reduce_pool(beta); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { From 807b0e62757292a8de78fa090c379aff58bd6446 Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Sat, 30 May 2026 16:38:29 +0800 Subject: [PATCH 2/4] unified critical area --- source/source_hsolver/diago_bpcg.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/source_hsolver/diago_bpcg.cpp b/source/source_hsolver/diago_bpcg.cpp index 45c9b8b24d4..eb0b6b69458 100644 --- a/source/source_hsolver/diago_bpcg.cpp +++ b/source/source_hsolver/diago_bpcg.cpp @@ -313,7 +313,7 @@ void DiagoBPCG::diag(const HPsiFunc& hpsi_func, // orthogonal psi by cholesky method this->orth_cholesky(this->work, this->psi, this->hpsi, this->hsub); - if (current_scf_iter == 1 && ntry % this->nline == 0) { + if (current_scf_iter == 1 && ntry % (this->nline * 2) == 0) { this->calc_hsub_with_block(hpsi_func, psi_in, this->psi, this->hpsi, this->hsub, this->work, this->eigen); } } while (ntry < max_iter && this->test_error(this->err_st, ethr_band)); From ce44bfbf9a02466677fc160250d51ae73d6c5310 Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Sat, 30 May 2026 17:36:33 +0800 Subject: [PATCH 3/4] fix critical areas --- .../source_hsolver/kernels/bpcg_kernel_op.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index 53d548a04ec..db8f2f9bcff 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -28,7 +28,7 @@ struct line_minimize_with_block_op auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); #ifdef _OPENMP -#pragma omp critical(reduce_norm) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); @@ -42,15 +42,15 @@ struct line_minimize_with_block_op epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); } #ifdef _OPENMP -#pragma omp critical(reduce_epsilo) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(epsilo_0); #ifdef _OPENMP -#pragma omp critical(reduce_epsilo) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(epsilo_1); #ifdef _OPENMP -#pragma omp critical(reduce_epsilo) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(epsilo_2); theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); @@ -94,7 +94,7 @@ struct calc_grad_with_block_op auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); #ifdef _OPENMP -#pragma omp critical(reduce_norm) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(norm); norm = 1.0 / sqrt(norm); @@ -106,7 +106,7 @@ struct calc_grad_with_block_op epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); } #ifdef _OPENMP -#pragma omp critical(reduce_epsilo) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(epsilo); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) @@ -118,11 +118,11 @@ struct calc_grad_with_block_op beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? } #ifdef _OPENMP -#pragma omp critical(reduce_err) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(err); #ifdef _OPENMP -#pragma omp critical(reduce_beta) +#pragma omp critical(reduce_pool) #endif Parallel_Reduce::reduce_pool(beta); for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) @@ -226,4 +226,4 @@ template struct precondition_op; template struct normalize_op, base_device::DEVICE_CPU>; template struct normalize_op, base_device::DEVICE_CPU>; template struct normalize_op; -} // namespace hsolver \ No newline at end of file +} // namespace hsolver From 8ea4333cc81ce93871b73769a489f6b86cd6a021 Mon Sep 17 00:00:00 2001 From: Missing-Hex <1304266750@qq.com> Date: Sat, 30 May 2026 17:55:52 +0800 Subject: [PATCH 4/4] restructure critical areas --- .../source_hsolver/kernels/bpcg_kernel_op.cpp | 153 ++++++++++++------ 1 file changed, 106 insertions(+), 47 deletions(-) diff --git a/source/source_hsolver/kernels/bpcg_kernel_op.cpp b/source/source_hsolver/kernels/bpcg_kernel_op.cpp index db8f2f9bcff..2575c4e6e93 100644 --- a/source/source_hsolver/kernels/bpcg_kernel_op.cpp +++ b/source/source_hsolver/kernels/bpcg_kernel_op.cpp @@ -6,6 +6,7 @@ namespace hsolver { +// ========== 优化后的 line_minimize_with_block_op ========== template struct line_minimize_with_block_op { @@ -18,44 +19,65 @@ struct line_minimize_with_block_op const int& n_basis_max, const int& n_band) { + // 存储每个 band 的中间结果 + std::vector norms(n_band, 0.0); + std::vector epsilo_0(n_band, 0.0); + std::vector epsilo_1(n_band, 0.0); + std::vector epsilo_2(n_band, 0.0); + + // ========== Phase 1: 并行计算 norm ========== #ifdef _OPENMP -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(dynamic, 8) #endif for (int band_idx = 0; band_idx < n_band; band_idx++) { - Real epsilo_0 = 0.0, epsilo_1 = 0.0, epsilo_2 = 0.0; - Real theta = 0.0, cos_theta = 0.0, sin_theta = 0.0; auto A = reinterpret_cast(grad_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + } + + // 归一化 + for (int i = 0; i < n_band; i++) { + Parallel_Reduce::reduce_pool(norms[i]); + norms[i] = 1.0 / sqrt(norms[i]); + } + + // ========== Phase 2: 并行归一化并计算 epsilo ========== #ifdef _OPENMP -#pragma omp critical(reduce_pool) +#pragma omp parallel for schedule(dynamic, 8) #endif - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real norm = norms[band_idx]; + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; grad_out[item] *= norm; hgrad_out[item] *= norm; - epsilo_0 += std::real(hpsi_out[item] * std::conj(psi_out[item])); - epsilo_1 += std::real(grad_out[item] * std::conj(hpsi_out[item])); - epsilo_2 += std::real(grad_out[item] * std::conj(hgrad_out[item])); + epsilo_0[band_idx] += std::real(hpsi_out[item] * std::conj(psi_out[item])); + epsilo_1[band_idx] += std::real(grad_out[item] * std::conj(hpsi_out[item])); + epsilo_2[band_idx] += std::real(grad_out[item] * std::conj(hgrad_out[item])); } + } + + // 归一化 epsilo + for (int i = 0; i < n_band; i++) { + Parallel_Reduce::reduce_pool(epsilo_0[i]); + Parallel_Reduce::reduce_pool(epsilo_1[i]); + Parallel_Reduce::reduce_pool(epsilo_2[i]); + } + + // ========== Phase 3: 并行更新 psi 和 hpsi ========== #ifdef _OPENMP -#pragma omp critical(reduce_pool) -#endif - Parallel_Reduce::reduce_pool(epsilo_0); -#ifdef _OPENMP -#pragma omp critical(reduce_pool) -#endif - Parallel_Reduce::reduce_pool(epsilo_1); -#ifdef _OPENMP -#pragma omp critical(reduce_pool) +#pragma omp parallel for schedule(dynamic, 8) #endif - Parallel_Reduce::reduce_pool(epsilo_2); - theta = 0.5 * std::abs(std::atan(2 * epsilo_1 / (epsilo_0 - epsilo_2))); - cos_theta = std::cos(theta); - sin_theta = std::sin(theta); + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real theta = 0.5 * std::abs(std::atan(2 * epsilo_1[band_idx] / + (epsilo_0[band_idx] - epsilo_2[band_idx]))); + Real cos_theta = std::cos(theta); + Real sin_theta = std::sin(theta); + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; @@ -66,6 +88,7 @@ struct line_minimize_with_block_op } }; +// ========== 优化后的 calc_grad_with_block_op ========== template struct calc_grad_with_block_op { @@ -81,58 +104,94 @@ struct calc_grad_with_block_op const int& n_basis_max, const int& n_band) { + // 存储每个 band 的中间结果 + std::vector norms(n_band, 0.0); + std::vector epsilos(n_band, 0.0); + std::vector errs(n_band, 0.0); + std::vector betas(n_band, 0.0); + + // ========== Phase 1: 并行计算 norm ========== #ifdef _OPENMP -#pragma omp parallel for schedule(static) +#pragma omp parallel for schedule(dynamic, 8) #endif for (int band_idx = 0; band_idx < n_band; band_idx++) { - Real err = 0.0; - Real beta = 0.0; - Real epsilo = 0.0; - Real grad_2 = {0.0}; - T grad_1 = {0.0, 0.0}; auto A = reinterpret_cast(psi_out + band_idx * n_basis_max); - Real norm = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + norms[band_idx] = BlasConnector::dot(2 * n_basis, A, 1, A, 1); + } + + // 归一化 + for (int i = 0; i < n_band; i++) { + Parallel_Reduce::reduce_pool(norms[i]); + norms[i] = 1.0 / sqrt(norms[i]); + } + + // ========== Phase 2: 并行归一化并计算 epsilo ========== #ifdef _OPENMP -#pragma omp critical(reduce_pool) +#pragma omp parallel for schedule(dynamic, 8) #endif - Parallel_Reduce::reduce_pool(norm); - norm = 1.0 / sqrt(norm); + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real norm = norms[band_idx]; + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; psi_out[item] *= norm; hpsi_out[item] *= norm; - epsilo += std::real(hpsi_out[item] * std::conj(psi_out[item])); + epsilos[band_idx] += std::real(hpsi_out[item] * std::conj(psi_out[item])); } + } + + // 归一化 epsilo + for (int i = 0; i < n_band; i++) { + Parallel_Reduce::reduce_pool(epsilos[i]); + } + + // ========== Phase 3: 并行计算 err 和 beta ========== #ifdef _OPENMP -#pragma omp critical(reduce_pool) +#pragma omp parallel for schedule(dynamic, 8) #endif - Parallel_Reduce::reduce_pool(epsilo); + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo = epsilos[band_idx]; + T grad_1 = {0.0, 0.0}; + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_2 = std::norm(grad_1); - err += grad_2; - beta += grad_2 / prec_in[basis_idx]; /// Mark here as we should div the prec? + Real grad_2 = std::norm(grad_1); + errs[band_idx] += grad_2; + betas[band_idx] += grad_2 / prec_in[basis_idx]; } + } + + // 归一化 err 和 beta + for (int i = 0; i < n_band; i++) { + Parallel_Reduce::reduce_pool(errs[i]); + Parallel_Reduce::reduce_pool(betas[i]); + } + + // ========== Phase 4: 并行计算最终梯度 ========== #ifdef _OPENMP -#pragma omp critical(reduce_pool) -#endif - Parallel_Reduce::reduce_pool(err); -#ifdef _OPENMP -#pragma omp critical(reduce_pool) +#pragma omp parallel for schedule(dynamic, 8) #endif - Parallel_Reduce::reduce_pool(beta); + for (int band_idx = 0; band_idx < n_band; band_idx++) + { + Real epsilo = epsilos[band_idx]; + Real beta = betas[band_idx]; + T grad_1 = {0.0, 0.0}; + for (int basis_idx = 0; basis_idx < n_basis; basis_idx++) { auto item = band_idx * n_basis_max + basis_idx; grad_1 = hpsi_out[item] - epsilo * psi_out[item]; - grad_out[item] = -grad_1 / prec_in[basis_idx] + beta / beta_out[band_idx] * grad_old_out[item]; + grad_out[item] = -grad_1 / prec_in[basis_idx] + + beta / beta_out[band_idx] * grad_old_out[item]; } beta_out[band_idx] = beta; - err_out[band_idx] = sqrt(err); + err_out[band_idx] = sqrt(errs[band_idx]); } } };