Sacado Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
mat_vec_hierarchical_dfad.cpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Sacado Package
5// Copyright (2006) Sandia Corporation
6//
7// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8// the U.S. Government retains certain rights in this software.
9//
10// This library is free software; you can redistribute it and/or modify
11// it under the terms of the GNU Lesser General Public License as
12// published by the Free Software Foundation; either version 2.1 of the
13// License, or (at your option) any later version.
14//
15// This library is distributed in the hope that it will be useful, but
16// WITHOUT ANY WARRANTY; without even the implied warranty of
17// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18// Lesser General Public License for more details.
19//
20// You should have received a copy of the GNU Lesser General Public
21// License along with this library; if not, write to the Free Software
22// Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23// USA
24// Questions? Contact David M. Gay (dmgay@sandia.gov) or Eric T. Phipps
25// (etphipp@sandia.gov).
26//
27// ***********************************************************************
28// @HEADER
29
30#define SACADO_VIEW_CUDA_HIERARCHICAL_DFAD 1
31#define SACADO_KOKKOS_USE_MEMORY_POOL 1
32
33#include "Sacado.hpp"
34
36
37#include "Kokkos_Timer.hpp"
38
39template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
40void run_mat_vec_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b,
41 const ViewTypeC& c) {
42 typedef typename ViewTypeC::value_type scalar_type;
43 typedef typename ViewTypeC::execution_space execution_space;
44
45#if defined (KOKKOS_ENABLE_CUDA)
46 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
47#else
48 const bool is_cuda = false;
49#endif
50 const unsigned vector_size = is_cuda ? 32 : 1;
51 const unsigned team_size = is_cuda ? 128 / vector_size : 1;
52
53 const int m = A.extent(0);
54 const int n = A.extent(1);
55 const int range = (m+team_size-1)/team_size;
56
57 typedef Kokkos::TeamPolicy<execution_space> Policy;
58 Kokkos::parallel_for(
59 Policy( range,team_size,vector_size ),
60 KOKKOS_LAMBDA (const typename Policy::member_type& team) {
61 const int i = team.league_rank()*team.team_size() + team.team_rank();
62 if (i >= m)
63 return;
64
65 scalar_type t = 0.0;
66 for (int j=0; j<n; ++j)
67 t += A(i,j)*b(j);
68 c(i) = t;
69 }
70 );
71}
72
73template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
75 const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c) {
76 typedef typename ViewTypeC::value_type scalar_type;
77 typedef typename ViewTypeC::execution_space execution_space;
78 typedef Kokkos::TeamPolicy<execution_space> Policy;
79 typedef typename Policy::member_type team_member;
80 typedef Kokkos::View<scalar_type*,Kokkos::LayoutLeft, typename execution_space::scratch_memory_space, Kokkos::MemoryUnmanaged> TmpScratchSpace;
81
82#if defined (KOKKOS_ENABLE_CUDA)
83 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
84#else
85 const bool is_cuda = false;
86#endif
87 const unsigned VectorSize = is_cuda ? 32 : 1;
88 const unsigned TeamSize = is_cuda ? 128 / VectorSize : 1;
89
90 const int m = A.extent(0);
91 const int n = A.extent(1);
92 const int p = dimension_scalar(A);
93 const int N = (m+TeamSize-1)/TeamSize;
94
95 Policy policy(N, TeamSize, VectorSize);
96 const size_t bytes = TmpScratchSpace::shmem_size(TeamSize,p);
97 Kokkos::parallel_for(
98 policy.set_scratch_size(0, Kokkos::PerTeam(bytes)),
99 KOKKOS_LAMBDA (const team_member& team) {
100 const int team_rank = team.team_rank();
101 const int team_size = team.team_size();
102 TmpScratchSpace t(team.team_scratch(0), team_size, p);
103 const int i = team.league_rank()*team_size + team_rank;
104 if (i < m) {
105 t(team_rank) = 0.0;
106 for (int j=0; j<n; ++j)
107 t(team_rank) += A(i,j)*b(j);
108 c(i) = t(team_rank);
109 }
110 }
111 );
112}
113
114template <typename ViewTypeA, typename ViewTypeB, typename ViewTypeC>
115void
116check_deriv_hierarchical_dfad(const ViewTypeA& A, const ViewTypeB& b, const ViewTypeC& c)
117{
118 const double tol = 1.0e-14;
119 typedef typename ViewTypeC::value_type value_type;
120 typename ViewTypeC::HostMirror h_c = Kokkos::create_mirror_view(c);
121 Kokkos::deep_copy(h_c, c);
122 const size_t m = A.extent(0);
123 const size_t n = A.extent(1);
124 const size_t p = Kokkos::dimension_scalar(A);
125 for (size_t i=0; i<m; ++i) {
126 for (size_t j=0; j<p; ++j) {
127 value_type t = (j == p-1 ? n : 2*n);
128 if (std::abs(h_c(i).fastAccessDx(j)- t) > tol) {
129 std::cout << "Comparison failed! " << i << "," << j << " : "
130 << h_c(i).fastAccessDx(j) << " , " << t << std::endl;
131 }
132 }
133 }
134}
135
136template <typename FadType, typename ... ViewArgs>
137Perf
138do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p,
139 const size_t nloop, const bool check)
140{
141 typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
142 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
143 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
144 typedef typename ViewTypeA::execution_space execution_space;
148 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
149 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
150 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
151
152 ConViewTypeA A("A",m,n,p+1);
153 ConViewTypeB b("B",n,p+1);
154 ConViewTypeC c("c",m,p+1);
155
156 // FadType a(p, 1.0);
157 // for (size_t k=0; k<p; ++k)
158 // a.fastAccessDx(k) = 1.0;
159 Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
160 Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
161
162 Kokkos::Timer wall_clock;
163 Perf perf;
164
165#if defined (KOKKOS_ENABLE_CUDA)
166 const bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
167#else
168 const bool is_cuda = false;
169#endif
170 const size_t concurrency = execution_space().concurrency();
171 const size_t warp_dim = is_cuda ? 32 : 1;
172 const size_t block_size = p*sizeof(double);
173 const size_t nkernels = concurrency / warp_dim;
174 const size_t mem_pool_size =
175 static_cast<size_t>(1.2*nkernels*block_size);
176 const size_t superblock_size = std::max<size_t>(nkernels / 100, 1) * block_size;
177 execution_space space;
178 Sacado::createGlobalMemoryPool(space, mem_pool_size,
179 block_size,
180 block_size,
181 superblock_size
182 );
183
184 // Execute the kernel once to warm up
186 execution_space().fence();
187
188 wall_clock.reset();
189 for (size_t l=0; l<nloop; l++) {
191 }
192 execution_space().fence();
193
194 perf.time = wall_clock.seconds() / nloop;
195 perf.flops = m*n*(2+4*p);
196 perf.throughput = perf.flops / perf.time / 1.0e9;
197
198 if (check) {
200 }
201
203
204 return perf;
205}
206
207template <typename FadType, typename ... ViewArgs>
208Perf
210 const size_t m, const size_t n, const size_t p, const size_t nloop,
211 const bool check)
212{
213 typedef Kokkos::View<FadType**, ViewArgs...> ViewTypeA;
214 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeB;
215 typedef Kokkos::View<FadType*, ViewArgs...> ViewTypeC;
216 typedef typename ViewTypeA::execution_space execution_space;
220 typedef Kokkos::View<FadType**, ConLayoutA, execution_space> ConViewTypeA;
221 typedef Kokkos::View<FadType*, ConLayoutB, execution_space> ConViewTypeB;
222 typedef Kokkos::View<FadType*, ConLayoutC, execution_space> ConViewTypeC;
223
224 ConViewTypeA A("A",m,n,p+1);
225 ConViewTypeB b("B",n,p+1);
226 ConViewTypeC c("c",m,p+1);
227
228 // FadType a(p, 1.0);
229 // for (size_t k=0; k<p; ++k)
230 // a.fastAccessDx(k) = 1.0;
231 Kokkos::deep_copy(typename ConViewTypeA::array_type(A), 1.0);
232 Kokkos::deep_copy(typename ConViewTypeB::array_type(b), 1.0);
233
234 Kokkos::Timer wall_clock;
235 Perf perf;
236
237 // Execute the kernel once to warm up
239 execution_space().fence();
240
241 wall_clock.reset();
242 for (size_t l=0; l<nloop; l++) {
244 }
245 execution_space().fence();
246
247 perf.time = wall_clock.seconds() / nloop;
248 perf.flops = m*n*(2+4*p);
249 perf.throughput = perf.flops / perf.time / 1.0e9;
250
251 if (check) {
253 }
254
255 return perf;
256}
257
259
260#define INST_FUNC_FAD_DEV(FAD,DEV) \
261 template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
262 template Perf do_time_fad_hierarchical_dfad< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
263 template Perf do_time_fad_hierarchical_dfad< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
264 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutLeft, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
265 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, Kokkos::LayoutRight, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check ); \
266 template Perf do_time_fad_hierarchical_dfad_scratch< FAD, DEV > ( const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check );
267
268#define INST_FUNC_DEV(DEV) \
269 INST_FUNC_FAD_DEV( DFad_type, DEV )
270
271#ifdef KOKKOS_ENABLE_SERIAL
272INST_FUNC_DEV(Kokkos::Serial)
273#endif
274
275#ifdef KOKKOS_ENABLE_OPENMP
276INST_FUNC_DEV(Kokkos::OpenMP)
277#endif
278
279#ifdef KOKKOS_ENABLE_THREADS
280INST_FUNC_DEV(Kokkos::Threads)
281#endif
282
283#ifdef KOKKOS_ENABLE_CUDA
284INST_FUNC_DEV(Kokkos::Cuda)
285#endif
expr expr expr fastAccessDx(i)) FAD_UNARYOP_MACRO(exp
expr expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c *expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr1 c expr2 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 expr2 expr1 expr2 expr1 expr1 expr1 c
#define A
Definition: Sacado_rad.hpp:572
const int N
Sacado::Fad::DFad< double > FadType
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
const char * p
Perf do_time_fad_hierarchical_dfad(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
void check_deriv_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
#define INST_FUNC_DEV(DEV)
void run_mat_vec_hierarchical_dfad_scratch(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
void run_mat_vec_hierarchical_dfad(const ViewTypeA &A, const ViewTypeB &b, const ViewTypeC &c)
Perf do_time_fad_hierarchical_dfad_scratch(const size_t m, const size_t n, const size_t p, const size_t nloop, const bool check)
Sacado::Fad::DFad< double > DFad_type
void createGlobalMemoryPool(const ExecSpace &space, const size_t min_total_alloc_size, const uint32_t min_block_alloc_size, const uint32_t max_block_alloc_size, const uint32_t min_superblock_size)
void destroyGlobalMemoryPool(const ExecSpace &space)
double time
double flops
double throughput
const double tol