Highly Efficient FFT for Exascale: HeFFTe v2.4
Loading...
Searching...
No Matches
heffte_backend_rocm.h
1/*
2 -- heFFTe --
3 Univ. of Tennessee, Knoxville
4 @date
5*/
6
7#ifndef HEFFTE_BACKEND_ROCM_H
8#define HEFFTE_BACKEND_ROCM_H
9
10#include "heffte_r2r_executor.h"
11
12#ifdef Heffte_ENABLE_ROCM
13
14#ifndef __HIP_PLATFORM_HCC__
15#define __HIP_PLATFORM_HCC__
16#endif
17#include <hip/hip_runtime.h>
18#include <rocfft/rocfft.h>
19#include "heffte_backend_vector.h"
20
21#ifdef Heffte_ENABLE_MAGMA
22#include "heffte_magma_helpers.h"
23#endif
24
40namespace heffte{
41
46namespace rocm {
51 inline void check_error(hipError_t status, const char *function_name){
52 if (status != hipSuccess)
53 throw std::runtime_error(std::string(function_name) + " failed with message: " + std::string(hipGetErrorString(status)));
54 }
59 inline void check_error(rocfft_status status, const char *function_name){
60 if (status != rocfft_status_success)
61 throw std::runtime_error(std::string(function_name) + " failed with error code: " + std::to_string(status));
62 }
63
70 template<typename precision_type, typename index>
71 void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex<precision_type> destination[]);
78 template<typename precision_type, typename index>
79 void convert(hipStream_t stream, index num_entries, std::complex<precision_type> const source[], precision_type destination[]);
80
85 template<typename scalar_type, typename index>
86 void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor);
87
94 template<typename scalar_type, typename index>
95 void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
96 scalar_type const source[], scalar_type destination[]);
103 template<typename scalar_type, typename index>
104 void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
105 scalar_type const source[], scalar_type destination[]);
112 template<typename scalar_type, typename index>
113 void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide,
114 index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2,
115 scalar_type const source[], scalar_type destination[]);
116
123 template<typename precision>
124 static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);
126 template<typename precision>
127 static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
129 template<typename precision>
130 static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
132 template<typename precision>
133 static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);
135 static int compute_extended_length(int length){
136 return 4 * length;
137 }
138 };
145 template<typename precision>
146 static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);
148 template<typename precision>
149 static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
151 template<typename precision>
152 static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
154 template<typename precision>
155 static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);
157 static int compute_extended_length(int length){
159 }
160 };
167 template<typename precision>
168 static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[]);
170 template<typename precision>
171 static void post_forward(hipStream_t, int length, std::complex<precision> const fft_result[], precision result[]);
173 template<typename precision>
174 static void pre_backward(hipStream_t, int length, precision const input[], std::complex<precision> fft_signal[]);
176 template<typename precision>
177 static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[]);
179 static int compute_extended_length(int length){
180 return 4 * (length-1);
181 }
182 };
183
188 struct initialize {
190 static bool is_initialized;
192 static void make() {
193 if (not is_initialized) {
194 rocfft_setup();
195 is_initialized = true;
196 }
197
198 }
199 };
200}
201
202namespace backend{
207 template<> struct is_enabled<rocfft> : std::true_type{};
212 template<> struct is_enabled<rocfft_cos> : std::true_type{};
217 template<> struct is_enabled<rocfft_sin> : std::true_type{};
222 template<> struct is_enabled<rocfft_cos1> : std::true_type{};
223
228 template<>
229 struct device_instance<tag::gpu>{
231 device_instance(hipStream_t new_stream = nullptr) : _stream(new_stream){}
233 hipStream_t stream(){ return _stream; }
235 hipStream_t stream() const{ return _stream; }
237 void synchronize_device() const{ rocm::check_error(hipStreamSynchronize(_stream), "device sync"); }
239 mutable hipStream_t _stream;
241 using stream_type = hipStream_t;
242 };
243
248 template<> struct default_backend<tag::gpu>{
250 using type = rocfft;
251 };
252
257 template<> struct data_manipulator<tag::gpu> {
259 using stream_type = hipStream_t;
263 template<typename scalar_type>
264 static scalar_type* allocate(hipStream_t, size_t num_entries){
265 void *new_data = nullptr;
266 rocm::check_error(hipMalloc(&new_data, num_entries * sizeof(scalar_type)), "hipMalloc()");
267 return reinterpret_cast<scalar_type*>(new_data);
268 }
270 template<typename scalar_type>
271 static void free(hipStream_t, scalar_type *pntr){
272 if (pntr == nullptr) return;
273 rocm::check_error(hipFree(pntr), "hipFree()");
274 }
276 template<typename scalar_type>
277 static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
278 if (stream == nullptr)
279 rocm::check_error(hipMemcpy(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice), "data_manipulator::copy_n()");
280 else
281 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream), "data_manipulator::copy_n()");
282 }
284 template<typename scalar_type>
285 static void copy_n(hipStream_t stream, std::complex<scalar_type> const source[], size_t num_entries, scalar_type destination[]){
286 rocm::convert(stream, static_cast<long long>(num_entries), source, destination);
287 }
289 template<typename scalar_type>
290 static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex<scalar_type> destination[]){
291 rocm::convert(stream, static_cast<long long>(num_entries), source, destination);
292 }
294 template<typename scalar_type>
295 static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
296 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToHost, stream),
297 "device_to_host (rocm)");
298 }
300 template<typename scalar_type>
301 static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
302 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyDeviceToDevice, stream),
303 "device_to_device (rocm)");
304 }
306 template<typename scalar_type>
307 static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[]){
308 rocm::check_error(hipMemcpyAsync(destination, source, num_entries * sizeof(scalar_type), hipMemcpyHostToDevice, stream),
309 "host_to_device (rocm)");
310 }
311 };
312
317 template<>
322 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
323 };
328 template<>
333 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
334 };
339 template<>
344 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
345 };
350 template<>
355 template<typename T> using container = heffte::gpu::device_vector<T, data_manipulator<tag::gpu>>;
356 };
357}
358
366template<typename precision_type, direction dir>
377 plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist){
378
379 rocfft_plan_description desc = nullptr;
380 rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
381
383 rocfft_plan_description_set_data_layout(
384 desc,
385 (dir == direction::forward) ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved,
386 (dir == direction::forward) ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real,
387 nullptr, nullptr,
388 1, &stride, (dir == direction::forward) ? rdist : cdist,
389 1, &stride, (dir == direction::forward) ? cdist : rdist
390 ),
391 "plan layout"
392 );
393
395 rocfft_plan_create(&plan, rocfft_placement_notinplace,
396 (dir == direction::forward) ? rocfft_transform_type_real_forward : rocfft_transform_type_real_inverse,
397 (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
398 1, &size, batch, desc),
399 "plan create");
400
401 rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
402
403 rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
404 }
406 ~plan_rocfft(){ rocfft_plan_destroy(plan); }
408 operator rocfft_plan() const{ return plan; }
410 size_t size_work() const{ return worksize; }
411
412private:
414 rocfft_plan plan;
416 size_t worksize;
417};
418
423template<typename precision_type, direction dir>
424struct plan_rocfft<std::complex<precision_type>, dir>{
433 plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist) : plan(nullptr), worksize(0){
434 rocfft_plan_description desc = nullptr;
435 rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
436
438 rocfft_plan_description_set_data_layout(
439 desc,
440 rocfft_array_type_complex_interleaved,
441 rocfft_array_type_complex_interleaved,
442 nullptr, nullptr,
443 1, &stride, dist, 1, &stride, dist
444 ),
445 "plan layout"
446 );
447
449 rocfft_plan_create(&plan, rocfft_placement_inplace,
450 (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
451 (std::is_same<precision_type, float>::value)? rocfft_precision_single : rocfft_precision_double,
452 1, &size, batch, desc),
453 "plan create");
454
455 rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
456
457 rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
458 }
468 plan_rocfft(size_t size1, size_t size2, std::array<size_t, 2> const &embed, size_t batch, size_t dist) : plan(nullptr), worksize(0){
469 size_t size[2] = {size1, size2};
470
471 rocfft_plan_description desc = nullptr;
472 rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
473
475 rocfft_plan_description_set_data_layout(
476 desc,
477 rocfft_array_type_complex_interleaved,
478 rocfft_array_type_complex_interleaved,
479 nullptr, nullptr,
480 2, embed.data(), dist, 2, embed.data(), dist
481 ),
482 "plan layout"
483 );
484
486 rocfft_plan_create(&plan, rocfft_placement_inplace,
487 (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
488 (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
489 2, size, batch, desc),
490 "plan create");
491
492 rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
493
494 rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
495 }
497 plan_rocfft(size_t size1, size_t size2, size_t size3){
498 std::array<size_t, 3> size = {size1, size2, size3};
499 rocfft_plan_description desc = nullptr;
500 rocm::check_error( rocfft_plan_description_create(&desc), "rocm plan create");
501
503 rocfft_plan_description_set_data_layout(
504 desc,
505 rocfft_array_type_complex_interleaved,
506 rocfft_array_type_complex_interleaved,
507 nullptr, nullptr, 3, nullptr, 1, 3, nullptr, 1
508 ),
509 "plan layout"
510 );
511
513 rocfft_plan_create(&plan, rocfft_placement_inplace,
514 (dir == direction::forward) ? rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse,
515 (std::is_same<precision_type, float>::value) ? rocfft_precision_single : rocfft_precision_double,
516 3, size.data(), 1, desc),
517 "plan create 3d");
518
519 rocm::check_error( rocfft_plan_get_work_buffer_size(plan, &worksize), "get_worksize");
520
521 rocm::check_error( rocfft_plan_description_destroy(desc), "rocm plan destroy");
522 }
524 ~plan_rocfft(){ rocm::check_error( rocfft_plan_destroy(plan), "plan destory"); }
526 operator rocfft_plan() const{ return plan; }
528 size_t size_work() const{ return worksize; }
529
530private:
532 rocfft_plan plan;
533 size_t worksize;
534};
535
549public:
557 template<typename index>
558 rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dimension) :
559 stream(active_stream),
560 size(box.size[dimension]), size2(0),
561 howmanyffts(fft1d_get_howmany(box, dimension)),
562 stride(fft1d_get_stride(box, dimension)),
563 dist((dimension == box.order[0]) ? size : 1),
564 blocks((dimension == box.order[1]) ? box.osize(2) : 1),
565 block_stride(box.osize(0) * box.osize(1)),
566 total_size(box.count()),
567 embed({0, 0}),
568 worksize(compute_workspace_size())
569 {
571 }
573 template<typename index>
574 rocfft_executor(hipStream_t active_stream, box3d<index> const box, int dir1, int dir2) :
575 stream(active_stream),
576 size(box.size[std::min(dir1, dir2)]), size2(box.size[std::max(dir1, dir2)]),
577 blocks(1), block_stride(0), total_size(box.count()),
578 worksize(0)
579 {
580 int odir1 = box.find_order(dir1);
581 int odir2 = box.find_order(dir2);
582
583 if (std::min(odir1, odir2) == 0 and std::max(odir1, odir2) == 1){
584 stride = 1;
585 dist = size * size2;
586 embed = {static_cast<size_t>(stride), static_cast<size_t>(size)};
587 howmanyffts = box.size[2];
588 }else if (std::min(odir1, odir2) == 1 and std::max(odir1, odir2) == 2){
589 stride = box.size[0];
590 dist = 1;
591 embed = {static_cast<size_t>(stride), static_cast<size_t>(size) * static_cast<size_t>(stride)};
592 howmanyffts = box.size[0];
593 }else{ // case of directions (0, 2)
594 stride = 1;
595 dist = size;
596 embed = {static_cast<size_t>(stride), static_cast<size_t>(box.size[1]) * static_cast<size_t>(box.size[0])};
597 howmanyffts = box.size[1];
598 }
599 worksize = compute_workspace_size();
600 }
602 template<typename index>
603 rocfft_executor(hipStream_t active_stream, box3d<index> const box) :
604 stream(active_stream),
605 size(box.size[0]), size2(box.size[1]), howmanyffts(box.size[2]),
606 stride(0), dist(0),
607 blocks(1), block_stride(0),
608 total_size(box.count()),
609 embed({0, 0}),
610 worksize(compute_workspace_size())
611 {}
612
614 template<typename precision_type, direction dir>
615 void execute(std::complex<precision_type> data[], std::complex<precision_type> *workspace) const{
616 if (std::is_same<precision_type, float>::value){
617 if (dir == direction::forward)
618 make_plan(ccomplex_forward);
619 else
620 make_plan(ccomplex_backward);
621 }else{
622 if (dir == direction::forward)
623 make_plan(zcomplex_forward);
624 else
625 make_plan(zcomplex_backward);
626 }
627 rocfft_execution_info info;
628 rocfft_execution_info_create(&info);
629 rocfft_execution_info_set_stream(info, stream);
630
631 size_t wsize = (std::is_same<precision_type, float>::value) ?
632 ((dir == direction::forward) ? ccomplex_forward->size_work() : ccomplex_backward->size_work()) :
633 ((dir == direction::forward) ? zcomplex_forward->size_work() : zcomplex_backward->size_work());
634
635 if (wsize > 0)
636 rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
637
638 for(int i=0; i<blocks; i++){
639 void* block_data = reinterpret_cast<void*>(data + i * block_stride);
640 rocm::check_error( rocfft_execute(
641 (std::is_same<precision_type, float>::value) ?
642 ((dir == direction::forward) ? *ccomplex_forward : *ccomplex_backward) :
643 ((dir == direction::forward) ? *zcomplex_forward : *zcomplex_backward),
644 &block_data, nullptr, info), "rocfft execute");
645 }
646 rocfft_execution_info_destroy(info);
647 }
648
650 void forward(std::complex<float> data[], std::complex<float> *workspace) const override{
651 execute<float, direction::forward>(data, workspace);
652 }
654 void forward(std::complex<double> data[], std::complex<double> *workspace) const override{
656 }
658 void backward(std::complex<float> data[], std::complex<float> *workspace) const override{
660 }
662 void backward(std::complex<double> data[], std::complex<double> *workspace) const override{
664 }
665
667 void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
668 rocm::convert(stream, total_size, indata, outdata);
669 forward(outdata, workspace);
670 }
672 void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
673 rocm::convert(stream, total_size, indata, outdata);
674 forward(outdata, workspace);
675 }
677 void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
678 backward(indata, workspace);
679 rocm::convert(stream, total_size, indata, outdata);
680 }
682 void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
683 backward(indata, workspace);
684 rocm::convert(stream, total_size, indata, outdata);
685 }
686
688 int box_size() const override{ return total_size; }
690 size_t workspace_size() const override{ return worksize; }
693 make_plan(ccomplex_forward);
694 make_plan(ccomplex_backward);
695 make_plan(zcomplex_forward);
696 make_plan(zcomplex_backward);
697 return
698 std::max( std::max(ccomplex_forward->size_work(), ccomplex_backward->size_work()) / sizeof(std::complex<float>),
699 std::max(zcomplex_forward->size_work(), zcomplex_backward->size_work()) / sizeof(std::complex<double>) ) + 1;
700 return 0;
701 }
702
703private:
705 template<typename scalar_type, direction dir>
706 void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{
707 if (not plan){
708 if (dist == 0)
709 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, howmanyffts));
710 else if (size2 == 0)
711 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, dist));
712 else
713 plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, size2, embed, howmanyffts, dist));
714 }
715 }
716
717 mutable hipStream_t stream;
718
719 int size, size2, howmanyffts, stride, dist, blocks, block_stride, total_size;
720 std::array<size_t, 2> embed;
721 mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::forward>> ccomplex_forward;
722 mutable std::unique_ptr<plan_rocfft<std::complex<float>, direction::backward>> ccomplex_backward;
723 mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::forward>> zcomplex_forward;
724 mutable std::unique_ptr<plan_rocfft<std::complex<double>, direction::backward>> zcomplex_backward;
725
726 size_t worksize;
727};
728
738public:
748 template<typename index>
749 rocfft_executor_r2c(hipStream_t active_stream, box3d<index> const box, int dimension) :
750 stream(active_stream),
751 size(box.size[dimension]),
752 howmanyffts(fft1d_get_howmany(box, dimension)),
753 stride(fft1d_get_stride(box, dimension)),
754 blocks((dimension == box.order[1]) ? box.osize(2) : 1),
755 rdist((dimension == box.order[0]) ? size : 1),
756 cdist((dimension == box.order[0]) ? size/2 + 1 : 1),
757 rblock_stride(box.osize(0) * box.osize(1)),
758 cblock_stride(box.osize(0) * (box.osize(1)/2 + 1)),
759 rsize(box.count()),
760 csize(box.r2c(dimension).count()),
761 worksize(compute_workspace_size())
762 {
764 }
765
767 template<typename precision_type>
768 void forward(precision_type const indata[], std::complex<precision_type> outdata[], std::complex<precision_type> *workspace) const{
769 if (std::is_same<precision_type, float>::value){
770 make_plan(sforward);
771 }else{
772 make_plan(dforward);
773 }
774
775 rocfft_execution_info info;
776 rocfft_execution_info_create(&info);
777 rocfft_execution_info_set_stream(info, stream);
778
779 size_t wsize = (std::is_same<precision_type, float>::value) ? sforward->size_work() : dforward->size_work();
780 if (wsize > 0)
781 rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
782
783 precision_type *copy_indata = reinterpret_cast<precision_type*>(
784 reinterpret_cast<unsigned char *>(workspace) + wsize);
785 backend::data_manipulator<tag::gpu>::copy_n(stream, indata, box_size(), copy_indata);
786
787 for(int i=0; i<blocks; i++){
788 void *rdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * rblock_stride));
789 void *cdata = reinterpret_cast<void*>(outdata + i * cblock_stride);
790 rocm::check_error( rocfft_execute(
791 (std::is_same<precision_type, float>::value) ? *sforward : *dforward,
792 &rdata, &cdata, info), "rocfft execute");
793 }
794 rocfft_execution_info_destroy(info);
795 }
797 template<typename precision_type>
798 void backward(std::complex<precision_type> indata[], precision_type outdata[], std::complex<precision_type> *workspace) const{
799 if (std::is_same<precision_type, float>::value){
800 make_plan(sbackward);
801 }else{
802 make_plan(dbackward);
803 }
804
805 rocfft_execution_info info;
806 rocfft_execution_info_create(&info);
807 rocfft_execution_info_set_stream(info, stream);
808
809 size_t wsize = (std::is_same<precision_type, float>::value) ? sbackward->size_work() : dbackward->size_work();
810 if (wsize > 0)
811 rocfft_execution_info_set_work_buffer(info, reinterpret_cast<void*>(workspace), wsize);
812
813 std::complex<precision_type> *copy_indata = reinterpret_cast<std::complex<precision_type>*>(
814 reinterpret_cast<unsigned char *>(workspace) + wsize);
815 backend::data_manipulator<tag::gpu>::copy_n(stream, indata, complex_size(), copy_indata);
816
817 for(int i=0; i<blocks; i++){
818 void *cdata = const_cast<void*>(reinterpret_cast<void const*>(copy_indata + i * cblock_stride));
819 void *rdata = reinterpret_cast<void*>(outdata + i * rblock_stride);
820 rocm::check_error( rocfft_execute(
821 (std::is_same<precision_type, float>::value) ? *sbackward : *dbackward,
822 &cdata, &rdata, info), "rocfft execute");
823 }
824 rocfft_execution_info_destroy(info);
825 }
827 void forward(float const indata[], std::complex<float> outdata[], std::complex<float> *workspace) const override{
828 forward<float>(indata, outdata, workspace);
829 }
831 void backward(std::complex<float> indata[], float outdata[], std::complex<float> *workspace) const override{
832 backward<float>(indata, outdata, workspace);
833 }
835 void forward(double const indata[], std::complex<double> outdata[], std::complex<double> *workspace) const override{
836 forward<double>(indata, outdata, workspace);
837 }
839 void backward(std::complex<double> indata[], double outdata[], std::complex<double> *workspace) const override{
840 backward<double>(indata, outdata, workspace);
841 }
842
844 int box_size() const override{ return rsize; }
846 int complex_size() const override{ return csize; }
848 size_t workspace_size() const override{ return worksize; }
851 make_plan(sforward);
852 make_plan(dforward);
853 make_plan(sbackward);
854 make_plan(dbackward);
855 // Temporary copies have to be made, request that from user in addition, to what rocFFT requires.
856 return
857 std::max( std::max(sforward->size_work() + box_size() * sizeof(float), sbackward->size_work() + complex_size() * sizeof(std::complex<float>)) / sizeof(std::complex<float>),
858 std::max(dforward->size_work() + box_size() * sizeof(double), dbackward->size_work() + complex_size() * sizeof(std::complex<double>)) / sizeof(std::complex<double>) ) + 1;
859 }
860
861private:
863 template<typename scalar_type, direction dir>
864 void make_plan(std::unique_ptr<plan_rocfft<scalar_type, dir>> &plan) const{
865 if (!plan) plan = std::unique_ptr<plan_rocfft<scalar_type, dir>>(new plan_rocfft<scalar_type, dir>(size, howmanyffts, stride, rdist, cdist));
866 }
867
868 mutable hipStream_t stream;
869
870 int size, howmanyffts, stride, blocks;
871 int rdist, cdist, rblock_stride, cblock_stride, rsize, csize;
872 mutable std::unique_ptr<plan_rocfft<float, direction::forward>> sforward;
873 mutable std::unique_ptr<plan_rocfft<double, direction::forward>> dforward;
874 mutable std::unique_ptr<plan_rocfft<float, direction::backward>> sbackward;
875 mutable std::unique_ptr<plan_rocfft<double, direction::backward>> dbackward;
876
877 size_t worksize;
878};
879
886template<> struct one_dim_backend<backend::rocfft>{
891};
892
929
934template<> struct direct_packer<tag::gpu>{
936 template<typename scalar_type, typename index>
937 void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
938 rocm::direct_pack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, data, buffer);
939 }
941 template<typename scalar_type, typename index>
942 void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
943 rocm::direct_unpack(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride, buffer, data);
944 }
945};
946
951template<> struct transpose_packer<tag::gpu>{
953 template<typename scalar_type, typename index>
954 void pack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const data[], scalar_type buffer[]) const{
955 direct_packer<tag::gpu>().pack(stream, plan, data, buffer); // packing is done the same way as the direct_packer
956 }
958 template<typename scalar_type, typename index>
959 void unpack(hipStream_t stream, pack_plan_3d<index> const &plan, scalar_type const buffer[], scalar_type data[]) const{
960 rocm::transpose_unpack<scalar_type>(stream, plan.size[0], plan.size[1], plan.size[2], plan.line_stride, plan.plane_stride,
961 plan.buff_line_stride, plan.buff_plane_stride, plan.map[0], plan.map[1], plan.map[2], buffer, data);
962 }
963};
964
965namespace data_scaling {
970 template<typename scalar_type, typename index>
971 static void apply(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor){
972 rocm::scale_data<scalar_type, long long>(stream, static_cast<long long>(num_entries), data, scale_factor);
973 }
978 template<typename precision_type, typename index>
979 static void apply(hipStream_t stream, index num_entries, std::complex<precision_type> *data, double scale_factor){
980 apply<precision_type>(stream, 2*num_entries, reinterpret_cast<precision_type*>(data), scale_factor);
981 }
982};
983
988template<> struct default_plan_options<backend::rocfft>{
990 static const bool use_reorder = true;
991};
996template<> struct default_plan_options<backend::rocfft_cos>{
998 static const bool use_reorder = true;
999};
1004template<> struct default_plan_options<backend::rocfft_sin>{
1006 static const bool use_reorder = true;
1007};
1012template<> struct default_plan_options<backend::rocfft_cos1>{
1014 static const bool use_reorder = true;
1015};
1016
1017}
1018
1019#endif
1020
1021#endif /* HEFFTE_BACKEND_FFTW_H */
Base class for all backend executors.
Definition heffte_common.h:561
virtual int complex_size() const
Return the size of the complex-box (r2c executors).
Definition heffte_common.h:594
virtual void backward(float[], float *) const
Backward r2r, single precision.
Definition heffte_common.h:570
virtual void forward(float[], float *) const
Forward r2r, single precision.
Definition heffte_common.h:566
Wrapper to rocFFT API for real-to-complex transform with shortening of the data.
Definition heffte_backend_rocm.h:737
int complex_size() const override
Returns the size of the box with complex coefficients.
Definition heffte_backend_rocm.h:846
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Backward transform, single precision.
Definition heffte_backend_rocm.h:831
rocfft_executor_r2c(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor defines the box and the dimension of reduction.
Definition heffte_backend_rocm.h:749
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Forward transform, single precision.
Definition heffte_backend_rocm.h:827
int box_size() const override
Returns the size of the box with real data.
Definition heffte_backend_rocm.h:844
void forward(precision_type const indata[], std::complex< precision_type > outdata[], std::complex< precision_type > *workspace) const
Forward transform, single precision.
Definition heffte_backend_rocm.h:768
void backward(std::complex< precision_type > indata[], precision_type outdata[], std::complex< precision_type > *workspace) const
Backward transform, single precision.
Definition heffte_backend_rocm.h:798
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition heffte_backend_rocm.h:850
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Backward transform, double precision.
Definition heffte_backend_rocm.h:839
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_rocm.h:848
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Forward transform, double precision.
Definition heffte_backend_rocm.h:835
Wrapper around the rocFFT API.
Definition heffte_backend_rocm.h:548
void forward(double const indata[], std::complex< double > outdata[], std::complex< double > *workspace) const override
Converts the real data to complex and performs double-complex forward transform.
Definition heffte_backend_rocm.h:672
void backward(std::complex< float > indata[], float outdata[], std::complex< float > *workspace) const override
Performs backward float-complex transform and truncates the complex part of the result.
Definition heffte_backend_rocm.h:677
void forward(float const indata[], std::complex< float > outdata[], std::complex< float > *workspace) const override
Converts the real data to complex and performs float-complex forward transform.
Definition heffte_backend_rocm.h:667
void backward(std::complex< double > indata[], double outdata[], std::complex< double > *workspace) const override
Performs backward double-complex transform and truncates the complex part of the result.
Definition heffte_backend_rocm.h:682
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dimension)
Constructor, specifies the box and dimension.
Definition heffte_backend_rocm.h:558
size_t compute_workspace_size() const
Computes the size of the needed workspace.
Definition heffte_backend_rocm.h:692
rocfft_executor(hipStream_t active_stream, box3d< index > const box, int dir1, int dir2)
Merges two FFTs into one.
Definition heffte_backend_rocm.h:574
int box_size() const override
Returns the size of the box.
Definition heffte_backend_rocm.h:688
void forward(std::complex< double > data[], std::complex< double > *workspace) const override
Forward fft, double-complex case.
Definition heffte_backend_rocm.h:654
void backward(std::complex< float > data[], std::complex< float > *workspace) const override
Backward fft, float-complex case.
Definition heffte_backend_rocm.h:658
size_t workspace_size() const override
Return the size of the needed workspace.
Definition heffte_backend_rocm.h:690
void execute(std::complex< precision_type > data[], std::complex< precision_type > *workspace) const
Perform an in-place FFT on the data in the given direction.
Definition heffte_backend_rocm.h:615
rocfft_executor(hipStream_t active_stream, box3d< index > const box)
Merges three FFTs into one.
Definition heffte_backend_rocm.h:603
void backward(std::complex< double > data[], std::complex< double > *workspace) const override
Backward fft, double-complex case.
Definition heffte_backend_rocm.h:662
void forward(std::complex< float > data[], std::complex< float > *workspace) const override
Forward fft, float-complex case.
Definition heffte_backend_rocm.h:650
int fft1d_get_howmany(box3d< index > const box, int const dimension)
Return the number of 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:159
int fft1d_get_stride(box3d< index > const box, int const dimension)
Return the stride of the 1-D ffts contained in the box in the given dimension.
Definition heffte_geometry.h:169
@ backward
Inverse DFT transform.
@ forward
Forward DFT transform.
void apply(cudaStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Simply multiply the num_entries in the data by the scale_factor.
Definition heffte_backend_cuda.h:837
void check_error(hipError_t status, const char *function_name)
Checks the status of a ROCm command and in case of a failure, converts it to a C++ exception.
Definition heffte_backend_rocm.h:51
void convert(hipStream_t stream, index num_entries, precision_type const source[], std::complex< precision_type > destination[])
Convert real numbers to complex when both are located on the GPU device.
void transpose_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, index buff_line_stride, index buff_plane_stride, int map0, int map1, int map2, scalar_type const source[], scalar_type destination[])
Performs a transpose-unpack operation for data sitting on the GPU device.
void scale_data(hipStream_t stream, index num_entries, scalar_type *data, double scale_factor)
Scales real data (double or float) by the scaling factor.
void direct_pack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-pack operation for data sitting on the GPU device.
void direct_unpack(hipStream_t stream, index nfast, index nmid, index nslow, index line_stride, index plane_stide, scalar_type const source[], scalar_type destination[])
Performs a direct-unpack operation for data sitting on the GPU device.
Namespace containing all HeFFTe methods and classes.
Definition heffte_backend_cuda.h:38
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:322
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:355
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:333
heffte::gpu::device_vector< T, data_manipulator< tag::gpu > > container
The data is managed by the ROCm vector container.
Definition heffte_backend_rocm.h:344
Defines the container for the temporary buffers.
Definition heffte_common.h:237
Type-tag for the cuFFT backend.
Definition heffte_common.h:162
static void copy_device_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the device.
Definition heffte_backend_rocm.h:301
cudaStream_t stream_type
The stream type for the device.
Definition heffte_backend_cuda.h:232
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Equivalent to std::copy_n() but using CUDA arrays.
Definition heffte_backend_rocm.h:277
static scalar_type * allocate(hipStream_t, size_t num_entries)
Allocate memory.
Definition heffte_backend_rocm.h:264
static void copy_n(hipStream_t stream, scalar_type const source[], size_t num_entries, std::complex< scalar_type > destination[])
Copy-convert real-to-complex.
Definition heffte_backend_rocm.h:290
static void copy_host_to_device(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the host to the device.
Definition heffte_backend_rocm.h:307
static void copy_device_to_host(hipStream_t stream, scalar_type const source[], size_t num_entries, scalar_type destination[])
Copy the date from the device to the host.
Definition heffte_backend_rocm.h:295
static void copy_n(hipStream_t stream, std::complex< scalar_type > const source[], size_t num_entries, scalar_type destination[])
Copy-convert complex-to-real.
Definition heffte_backend_rocm.h:285
static void free(hipStream_t, scalar_type *pntr)
Free memory.
Definition heffte_backend_rocm.h:271
Common data-transfer operations, must be specializes for each location (cpu/gpu).
Definition heffte_common.h:59
Defines inverse mapping from the location tag to a default backend tag.
Definition heffte_common.h:430
The CUDA backend uses a CUDA stream.
Definition heffte_backend_rocm.h:229
device_instance(hipStream_t new_stream=nullptr)
Constructor, sets up the stream.
Definition heffte_backend_rocm.h:231
void synchronize_device() const
Syncs the execution with the queue.
Definition heffte_backend_rocm.h:237
hipStream_t stream() const
Returns the nullptr (const case).
Definition heffte_backend_rocm.h:235
cudaStream_t stream_type
The type for the internal stream.
Definition heffte_backend_cuda.h:214
hipStream_t stream()
Returns the nullptr.
Definition heffte_backend_rocm.h:233
hipStream_t _stream
The CUDA stream to be used in all operations.
Definition heffte_backend_rocm.h:239
Holds the auxiliary variables needed by each backend.
Definition heffte_common.h:408
Allows to define whether a specific backend interface has been enabled.
Definition heffte_common.h:226
Type-tag for the Cosine Transform of type 1 using the rocFFT backend.
Definition heffte_common.h:199
Type-tag for the Cosine Transform using the rocFFT backend.
Definition heffte_common.h:189
Type-tag for the Sine Transform using the rocFFT backend.
Definition heffte_common.h:194
Type-tag for the rocFFT backend.
Definition heffte_common.h:184
A generic container that describes a 3d box of indexes.
Definition heffte_geometry.h:67
std::array< index, 3 > const size
The number of indexes in each direction.
Definition heffte_geometry.h:129
int find_order(int dir) const
Returns the effective order of the direction (dir), 0 -> fast, 1 -> mid, 2 -> slow.
Definition heffte_geometry.h:121
Defines a set of default plan options for a given backend.
Definition heffte_common.h:761
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned unpack operation.
Definition heffte_backend_rocm.h:942
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_rocm.h:937
Defines the direct packer without implementation, use the specializations to get the CPU or GPU imple...
Definition heffte_pack3d.h:83
void executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_rocm.h:927
void executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_rocm.h:903
void executor_r2c
Defines the real-to-complex executor.
Definition heffte_backend_rocm.h:915
Indicates the structure that will be used by the fft backend.
Definition heffte_common.h:663
Holds the plan for a pack/unpack operation.
Definition heffte_pack3d.h:32
index buff_plane_stride
Stride of the planes in the received buffer (transpose packing only).
Definition heffte_pack3d.h:42
index line_stride
Stride of the lines.
Definition heffte_pack3d.h:36
index plane_stride
Stride of the planes.
Definition heffte_pack3d.h:38
std::array< index, 3 > size
Number of elements in the three directions.
Definition heffte_pack3d.h:34
std::array< int, 3 > map
Maps the i,j,k indexes from input to the output (transpose packing only).
Definition heffte_pack3d.h:44
index buff_line_stride
Stride of the lines in the received buffer (transpose packing only).
Definition heffte_pack3d.h:40
~plan_rocfft()
Destructor, deletes the plan.
Definition heffte_backend_rocm.h:524
plan_rocfft(size_t size1, size_t size2, std::array< size_t, 2 > const &embed, size_t batch, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_rocm.h:468
plan_rocfft(size_t size, size_t batch, size_t stride, size_t dist)
Constructor, takes inputs identical to cufftMakePlanMany().
Definition heffte_backend_rocm.h:433
plan_rocfft(size_t size1, size_t size2, size_t size3)
Constructor, takes inputs identical to cufftPlan3d()
Definition heffte_backend_rocm.h:497
size_t size_work() const
Return the worksize.
Definition heffte_backend_rocm.h:528
Plan for the r2c single precision transform.
Definition heffte_backend_rocm.h:367
~plan_rocfft()
Destructor, deletes the plan.
Definition heffte_backend_rocm.h:406
size_t size_work() const
Return the worksize.
Definition heffte_backend_rocm.h:410
plan_rocfft(size_t size, size_t batch, size_t stride, size_t rdist, size_t cdist)
Constructor and initializer of the plan.
Definition heffte_backend_rocm.h:377
Template algorithm for the Sine and Cosine transforms.
Definition heffte_r2r_executor.h:192
Implementation of Cosine Transform of type 1 pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:165
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:179
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
Implementation of Cosine Transform pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:121
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:135
Indicates whether the rocfft_setup() method has been called.
Definition heffte_backend_rocm.h:188
static bool is_initialized
Static (global) variable indicating if rocfft_setup() has been called.
Definition heffte_backend_rocm.h:190
static void make()
Make initialize.
Definition heffte_backend_rocm.h:192
Implementation of Sine Transform pre-post processing methods using CUDA.
Definition heffte_backend_rocm.h:143
static void post_backward(hipStream_t, int length, precision const fft_result[], precision result[])
Post-process in the inverse transform.
static void pre_backward(hipStream_t, int length, precision const input[], std::complex< precision > fft_signal[])
Pre-process in the inverse transform.
static int compute_extended_length(int length)
Computes the length of the extended signal.
Definition heffte_backend_rocm.h:157
static void post_forward(hipStream_t, int length, std::complex< precision > const fft_result[], precision result[])
Post-process in the forward transform.
static void pre_forward(hipStream_t, int length, precision const input[], precision fft_signal[])
Pre-process in the forward transform.
Indicates the use of gpu backend and that all input/output data and arrays will be bound to the gpu d...
Definition heffte_common.h:45
void pack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const data[], scalar_type buffer[]) const
Execute the planned pack operation.
Definition heffte_backend_rocm.h:954
void unpack(hipStream_t stream, pack_plan_3d< index > const &plan, scalar_type const buffer[], scalar_type data[]) const
Execute the planned transpose-unpack operation.
Definition heffte_backend_rocm.h:959
Defines the transpose packer without implementation, use the specializations to get the CPU implement...
Definition heffte_pack3d.h:116