Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
FadMPAssembly/TestAssembly.hpp
Go to the documentation of this file.
1// @HEADER
2// ***********************************************************************
3//
4// Stokhos Package
5// Copyright (2009) Sandia Corporation
6//
7// Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8// license for use of this work by or on behalf of the U.S. Government.
9//
10// Redistribution and use in source and binary forms, with or without
11// modification, are permitted provided that the following conditions are
12// met:
13//
14// 1. Redistributions of source code must retain the above copyright
15// notice, this list of conditions and the following disclaimer.
16//
17// 2. Redistributions in binary form must reproduce the above copyright
18// notice, this list of conditions and the following disclaimer in the
19// documentation and/or other materials provided with the distribution.
20//
21// 3. Neither the name of the Corporation nor the names of the
22// contributors may be used to endorse or promote products derived from
23// this software without specific prior written permission.
24//
25// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36//
37// Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38//
39// ***********************************************************************
40// @HEADER
41#include <iostream>
42
43// MP::Vector
46
47// Compile-time loops
48#include "Sacado_mpl_range_c.hpp"
49#include "Sacado_mpl_for_each.hpp"
50#include "Sacado_mpl_integral_c.hpp"
51
52// Kokkos libraries' headers:
53#include <Kokkos_UnorderedMap.hpp>
54#include <Kokkos_StaticCrsGraph.hpp>
55#include <Kokkos_Timer.hpp>
56
57// Utilities
58#include <Teuchos_CommHelpers.hpp>
59#include "Teuchos_TestingHelpers.hpp"
60#include "Teuchos_VerboseObject.hpp"
61
62// FENL
63#include <BoxElemFixture.hpp>
64#include <VectorImport.hpp>
65#include <fenl_functors.hpp>
66
67struct Perf {
70 double import_time ;
71 double fill_time ;
72
75 import_time(0) ,
76 fill_time(0) {}
77
78 void increment(const Perf& p) {
83 }
84
85 void scale(double s) {
86 import_time *= s;
87 fill_time *= s;
88 }
89};
90
91inline
92double maximum( const Teuchos::RCP<const Teuchos::Comm<int> >& comm , double local )
93{
94 double global = 0 ;
95 Teuchos::reduceAll( *comm , Teuchos::REDUCE_MAX , 1 , & local , & global );
96 return global ;
97}
98
99template <typename Scalar, typename Device,
102 const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
103 const int use_print ,
104 const int use_trials ,
105 const int use_nodes[] ,
107 Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device >& nodal_residual)
108{
109 using Teuchos::RCP;
110 using Teuchos::rcp;
111 using Teuchos::rcpFromRef;
112 using Teuchos::arrayView;
113 using Teuchos::ParameterList;
114
116
118
119 typedef typename LocalMatrixType::StaticCrsGraphType LocalGraphType ;
120
122 NodeNodeGraphType ;
123
124 //typedef Kokkos::Example::FENL::ElementComputationConstantCoefficient CoeffFunctionType;
127 ElementComputationType ;
128
129 // typedef Kokkos::Example::FENL::DirichletComputation< FixtureType , LocalMatrixType >
130 // DirichletComputationType ;
131
132 typedef typename ElementComputationType::vector_type VectorType ;
133
135 typename FixtureType::comm_list_type ,
136 typename FixtureType::send_nodeid_type ,
137 VectorType > ImportType ;
138
139 //------------------------------------
140
141 const int print_flag = use_print && std::is_same< Kokkos::HostSpace , typename Device::memory_space >::value ;
142
143 const int comm_rank = comm->getRank();
144 const int comm_size = comm->getSize();
145
146 // Decompose by node to avoid parallel communication in assembly
147
148 const double bubble_x = 1.0 ;
149 const double bubble_y = 1.0 ;
150 const double bubble_z = 1.0 ;
151
152 const FixtureType fixture( Kokkos::Example::BoxElemPart::DecomposeNode ,
153 comm_size , comm_rank ,
154 use_nodes[0] , use_nodes[1] , use_nodes[2] ,
155 bubble_x , bubble_y , bubble_z );
156
157 //------------------------------------
158
159 const ImportType comm_nodal_import(
160 comm ,
161 fixture.recv_node() ,
162 fixture.send_node() ,
163 fixture.send_nodeid() ,
164 fixture.node_count_owned() ,
165 fixture.node_count() - fixture.node_count_owned() );
166
167 //------------------------------------
168
169 // const double bc_lower_value = 1 ;
170 // const double bc_upper_value = 2 ;
171 //CoeffFunctionType diffusion_coefficient( 1.0 );
172 CoeffFunctionType diffusion_coefficient( 1.0, 0.1, 1.0, 5 );
173 Kokkos::deep_copy( diffusion_coefficient.getRandomVariables(), 1.0 );
174
175 //------------------------------------
176
177 if ( print_flag ) {
178 std::cout << "ElemNode {" << std::endl ;
179 for ( unsigned ielem = 0 ; ielem < fixture.elem_count() ; ++ielem ) {
180 std::cout << " elem[" << ielem << "]{" ;
181 for ( unsigned inode = 0 ; inode < FixtureType::ElemNode ; ++inode ) {
182 std::cout << " " << fixture.elem_node(ielem,inode);
183 }
184 std::cout << " }" << std::endl ;
185 }
186 std::cout << "}" << std::endl ;
187 }
188
189 //------------------------------------
190
191 Kokkos::Timer wall_clock ;
192
193 Perf perf_stats = Perf() ;
194
195 for ( int itrial = 0 ; itrial < use_trials ; ++itrial ) {
196
197 Perf perf = Perf() ;
198
199 perf.global_elem_count = fixture.elem_count_global();
200 perf.global_node_count = fixture.node_count_global();
201
202 //----------------------------------
203 // Create the local sparse matrix graph and element-to-graph map
204 // from the element->to->node identifier array.
205 // The graph only has rows for the owned nodes.
206
207 typename NodeNodeGraphType::Times graph_times;
208 const NodeNodeGraphType
209 mesh_to_graph( fixture.elem_node() , fixture.node_count_owned(),
210 graph_times );
211
212 // Create the local sparse matrix from the graph:
213 LocalMatrixType jacobian( mesh_to_graph.graph );
214
215 //----------------------------------
216
217 // Allocate solution vector for each node in the mesh and residual vector for each owned node
218 VectorType nodal_solution( "nodal_solution" , fixture.node_count() );
219 nodal_residual = VectorType( "nodal_residual" , fixture.node_count_owned() );
220
221 // Get DeviceConfig structs used by some functors
222 Kokkos::Example::FENL::DeviceConfig dev_config_elem, dev_config_bc;
224 dev_config_bc );
225
226 // Create element computation functor
227 const ElementComputationType elemcomp( fixture , diffusion_coefficient ,
228 nodal_solution ,
229 mesh_to_graph.elem_graph ,
230 jacobian , nodal_residual ,
231 dev_config_elem );
232
233 // Create boundary condition functor
234 // const DirichletComputationType dirichlet(
235 // fixture , nodal_solution , jacobian , nodal_residual ,
236 // 2 /* apply at 'z' ends */ ,
237 // bc_lower_value ,
238 // bc_upper_value ,
239 // dev_config_bc );
240
241 Kokkos::deep_copy( nodal_solution , Scalar(1) );
242
243 //--------------------------------
244
245 wall_clock.reset();
246
247 comm_nodal_import( nodal_solution );
248
249 Device().fence();
250 perf.import_time = maximum( comm , wall_clock.seconds() );
251
252 //--------------------------------
253 // Element contributions to residual and jacobian
254
255 wall_clock.reset();
256
257 Kokkos::deep_copy( nodal_residual , Scalar(0) );
258 Kokkos::deep_copy( jacobian.values , Scalar(0) );
259
260 elemcomp.apply();
261
262 //--------------------------------
263 // Apply boundary conditions
264
265 //dirichlet.apply();
266
267 Device().fence();
268 perf.fill_time = maximum( comm , wall_clock.seconds() );
269
270 //--------------------------------
271
272 perf_stats.increment(perf);
273
274 }
275
276 return perf_stats ;
277}
278
279template <typename ScalarViewType, typename EnsembleViewType>
280bool check_residuals(const ScalarViewType& scalar_residual,
281 const EnsembleViewType& ensemble_residual)
282{
283 const double tol = 1e-14;
284 bool success = true;
285 Teuchos::RCP<Teuchos::FancyOStream> out =
286 Teuchos::VerboseObjectBase::getDefaultOStream();
287 std::stringstream buf;
288 Teuchos::FancyOStream fbuf(Teuchos::rcp(&buf,false));
289
290 typename ScalarViewType::HostMirror host_scalar_residual =
291 Kokkos::create_mirror_view(scalar_residual);
292 typename EnsembleViewType::HostMirror host_ensemble_residual =
293 Kokkos::create_mirror_view(ensemble_residual);
294 Kokkos::deep_copy( host_scalar_residual, scalar_residual );
295 Kokkos::deep_copy( host_ensemble_residual, ensemble_residual );
296
297 TEUCHOS_TEST_EQUALITY( host_scalar_residual.extent(0),
298 host_ensemble_residual.extent(0), fbuf, success );
299
300 const size_t num_node = host_scalar_residual.extent(0);
301 const size_t num_ensemble = Kokkos::dimension_scalar(host_ensemble_residual);
302 for (size_t i=0; i<num_node; ++i) {
303 for (size_t j=0; j<num_ensemble; ++j) {
304 TEUCHOS_TEST_FLOATING_EQUALITY(
305 host_scalar_residual(i), host_ensemble_residual(i).fastAccessCoeff(j),
306 tol, fbuf, success );
307 }
308 }
309
310 if (!success)
311 *out << buf.str();
312
313 return success;
314}
315
316template <class Storage,
319 typedef typename Storage::value_type Scalar;
320 typedef typename Storage::ordinal_type Ordinal;
321 typedef typename Storage::execution_space Device;
322 Teuchos::RCP<const Teuchos::Comm<int> > comm ;
323 const int use_print ;
324 const int use_trials ;
325 const int *use_nodes ;
326 const bool check ;
328
329 PerformanceDriverOp(const Teuchos::RCP<const Teuchos::Comm<int> >& comm_ ,
330 const int use_print_ ,
331 const int use_trials_ ,
332 const int use_nodes_[] ,
333 const bool check_ ,
335 comm(comm_),
336 use_print(use_print_),
337 use_trials(use_trials_),
338 use_nodes(use_nodes_),
339 check(check_),
340 dev_config(dev_config_) {}
341
342 template <typename ArgT>
343 void operator() (ArgT arg) const {
344 const int ensemble = ArgT::value;
345 typedef typename Storage::template apply_N<ensemble> NewStorageApply;
346 typedef typename NewStorageApply::type storage_type;
347 typedef Sacado::MP::Vector<storage_type> mp_vector_type;
348
349 typedef Kokkos::View< Scalar* , Kokkos::LayoutLeft, Device > scalar_vector_type ;
350 typedef Kokkos::View< mp_vector_type* , Kokkos::LayoutLeft, Device > ensemble_vector_type ;
351
352 scalar_vector_type scalar_residual;
353 Kokkos::Example::FENL::DeviceConfig scalar_dev_config(0, 1, 1);
354 Perf perf_scalar =
355 fenl_assembly<Scalar,Device,Method>(
357 scalar_dev_config, scalar_residual );
358
359 ensemble_vector_type ensemble_residual;
361#if defined( KOKKOS_ENABLE_CUDA )
362 const bool is_cuda = std::is_same<Device,Kokkos::Cuda>::value;
363#else
364 const bool is_cuda = false ;
365#endif
366 if (is_cuda) {
367 const size_t block_size = dev_config.block_dim.x * dev_config.block_dim.y;
368 ensemble_dev_config.block_dim.x = ensemble;
369 ensemble_dev_config.block_dim.y = block_size/ensemble;
370 }
371 Perf perf_ensemble =
372 fenl_assembly<mp_vector_type,Device,Method>(
374 ensemble_dev_config, ensemble_residual);
375
376 if (check)
377 check_residuals( scalar_residual, ensemble_residual );
378
379 double s =
380 1000.0 / ( use_trials * ensemble * perf_scalar.global_node_count );
381 perf_scalar.scale(s);
382 perf_ensemble.scale(s);
383
384 if (comm->getRank() == 0) {
385 std::cout.precision(3);
386 std::cout << use_nodes[0] << " , "
387 << perf_scalar.global_node_count << " , "
388 << std::setw(2) << ensemble << " , "
389 << std::scientific
390 << perf_scalar.import_time << " , "
391 << perf_ensemble.import_time << " , "
392 << std::fixed << std::setw(6)
393 << perf_scalar.import_time / perf_ensemble.import_time << " , "
394 << std::scientific
395 << perf_scalar.fill_time << " , "
396 << perf_ensemble.fill_time << " , "
397 << std::fixed << std::setw(6)
398 << perf_scalar.fill_time / perf_ensemble.fill_time << " , "
399 << std::endl;
400 }
401 }
402};
403
404template <class Storage, int entry_min, int entry_max, int entry_step,
406void performance_test_driver( const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
407 const int use_print ,
408 const int use_trials ,
409 const int use_nodes[] ,
410 const bool check ,
412{
413 if (comm->getRank() == 0) {
414 std::cout.precision(8);
415 std::cout << std::endl
416 << "\"Grid Size\" , "
417 << "\"FEM Size\" , "
418 << "\"Ensemble Size\" , "
419 << "\"Scalar Import Time\" , "
420 << "\"Ensemble Import Time\" , "
421 << "\"Ensemble Import Speedup\" , "
422 << "\"Scalar Fill Time\" , "
423 << "\"Ensemble Fill Time\" , "
424 << "\"Ensemble Fill Speedup\" , "
425 << std::endl;
426 }
427
428 // Loop over [entry_min, entry_max] vector entries per thread
429 typedef Sacado::mpl::range_c< int, entry_min, entry_max+1, entry_step > Range;
430 PerformanceDriverOp<Storage,Method> op(comm, use_print, use_trials,
431 use_nodes, check, dev_config);
432 Sacado::mpl::for_each_no_kokkos<Range> f(op);
433}
Perf fenl_assembly(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], Kokkos::Example::FENL::DeviceConfig dev_config, Kokkos::View< Scalar *, Kokkos::LayoutLeft, Device > &nodal_residual)
bool check_residuals(const ScalarViewType &scalar_residual, const EnsembleViewType &ensemble_residual)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
double maximum(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, double local)
expr1 expr1 expr1 expr2 expr1 expr1 c expr2 expr1 c fastAccessCoeff(j) - expr2.val(j)
Stokhos::StandardStorage< int, double > storage_type
Generate a distributed unstructured finite element mesh from a partitioned NX*NY*NZ box of elements.
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
ScalarType f(const Teuchos::Array< ScalarType > &x, double a, double b)
Stokhos::StandardStorage< int, double > Storage
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< View< T, P... > >::value, unsigned >::type dimension_scalar(const View< T, P... > &view)
static void eval(Kokkos::Example::FENL::DeviceConfig &dev_config_elem, Kokkos::Example::FENL::DeviceConfig &dev_config_bc)
void scale(double s)
void increment(const Perf &p)
Kokkos::Example::FENL::DeviceConfig dev_config
Storage::execution_space Device
PerformanceDriverOp(const Teuchos::RCP< const Teuchos::Comm< int > > &comm_, const int use_print_, const int use_trials_, const int use_nodes_[], const bool check_, Kokkos::Example::FENL::DeviceConfig dev_config_)
Teuchos::RCP< const Teuchos::Comm< int > > comm