45#include "Kokkos_Core.hpp"
46#include "Kokkos_Timer.hpp"
62#if defined(HAVE_MPI) && 0
75#ifdef HAVE_STOKHOS_KOKKOSLINALG
76#include "KokkosSparse_CrsMatrix.hpp"
77#include "KokkosSparse_spmv.hpp"
78#include "KokkosBlas1_update.hpp"
83template<
typename IntType >
90 return k + N * (
j + N * i );
95 std::vector< std::vector<size_t> > & graph )
97 graph.resize( N * N * N , std::vector<size_t>() );
101 for (
int i = 0 ; i < (int) N ; ++i ) {
102 for (
int j = 0 ;
j < (int) N ; ++
j ) {
103 for (
int k = 0 ; k < (int) N ; ++k ) {
107 graph[row].reserve(27);
109 for (
int ii = -1 ; ii < 2 ; ++ii ) {
110 for (
int jj = -1 ; jj < 2 ; ++jj ) {
111 for (
int kk = -1 ; kk < 2 ; ++kk ) {
112 if ( 0 <= i + ii && i + ii < (
int) N &&
113 0 <=
j + jj &&
j + jj < (
int) N &&
114 0 <= k + kk && k + kk < (
int) N ) {
117 graph[row].push_back(col);
120 total += graph[row].size();
138template<
typename ScalarType ,
typename TensorType,
class Device >
141 const std::vector<int> & var_degree ,
143 const int iterCount ,
144 const bool symmetric )
146 typedef ScalarType value_type ;
147 typedef Kokkos::View< value_type** ,
149 Device > block_vector_type ;
154 typedef typename matrix_type::graph_type graph_type ;
156 typedef ScalarType value_type ;
165 using Teuchos::Array;
168 const size_t num_KL = var_degree.size();
169 Array< RCP<const abstract_basis_type> > bases(num_KL);
170 for (
size_t i=0; i<num_KL; i++) {
172 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
174 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
176 RCP<const product_basis_type> basis =
177 rcp(
new product_basis_type(
179 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
184 std::vector< std::vector<size_t> > graph ;
186 const size_t outer_length = nGrid * nGrid * nGrid ;
195 Stokhos::create_stochastic_product_tensor< TensorType >( *basis,
197 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>( std::string(
"test crs graph") , graph );
199 const size_t inner_length = matrix.block.dimension();
200 const size_t inner_length_aligned = matrix.block.aligned_dimension();
203 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), inner_length_aligned , graph_length );
205 block_vector_type x =
206 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length_aligned , outer_length );
207 block_vector_type y =
208 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length_aligned , outer_length );
210 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
215 Kokkos::deep_copy( x , ScalarType(1.0) );
216 block_vector_type x0 =
217 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"),
218 inner_length_aligned , outer_length );
219 Kokkos::deep_copy( x0 , ScalarType(1.0) );
224 Kokkos::Timer clock ;
225 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
226 Kokkos::deep_copy( x, x0 );
231 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
232 const double flops_per_block = matrix.block.tensor().num_flops();
233 const double flops = 1.0e-9*graph_length*flops_per_block;
235 std::vector<double> perf(6) ;
237 perf[0] = outer_length * inner_length ;
238 perf[1] = seconds_per_iter ;
239 perf[2] = flops / seconds_per_iter;
240 perf[3] = matrix.block.tensor().entry_count();
241 perf[4] = inner_length ;
242 perf[5] = flops_per_block;
247template<
typename ScalarType ,
class Device >
250 const std::vector<int> & var_degree ,
252 const int iterCount ,
253 const bool symmetric )
255 typedef ScalarType value_type ;
256 typedef Kokkos::View< value_type**,
258 Device > block_vector_type ;
263 value_type , Device > matrix_type ;
265 typedef typename matrix_type::graph_type graph_type ;
267 typedef ScalarType value_type ;
276 using Teuchos::Array;
279 const size_t num_KL = var_degree.size();
280 Array< RCP<const abstract_basis_type> > bases(num_KL);
281 for (
size_t i=0; i<num_KL; i++) {
283 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
285 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
287 RCP<const product_basis_type> basis =
288 rcp(
new product_basis_type(
290 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
295 std::vector< std::vector<size_t> > fem_graph ;
297 const size_t fem_length = nGrid * nGrid * nGrid ;
300 const size_t stoch_length = basis->size();
304 block_vector_type x = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), stoch_length , fem_length );
305 block_vector_type y = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), stoch_length , fem_length );
307 Kokkos::deep_copy( x , ScalarType(1.0) );
315 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>(
316 std::string(
"test product tensor graph") , fem_graph );
317 matrix.values = block_vector_type(
318 Kokkos::ViewAllocateWithoutInitializing(
"matrix"), matrix.block.matrix_size() , fem_graph_length );
320 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
325 Kokkos::Timer clock ;
326 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
331 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
332 const double flops_per_block = 2.0*stoch_length*stoch_length;
333 const double flops = 1e-9*fem_graph_length*flops_per_block;
335 std::vector<double> perf(6);
336 perf[0] = fem_length * stoch_length ;
337 perf[1] = seconds_per_iter;
338 perf[2] = flops / seconds_per_iter;
339 perf[3] = Cijk->num_entries();
340 perf[4] = stoch_length;
341 perf[5] = flops_per_block;
351template<
typename ScalarType ,
class Device >
354 const std::vector<int> & var_degree ,
356 const int iterCount ,
357 const bool symmetric )
359 typedef ScalarType value_type ;
360 typedef Kokkos::View< value_type* , Device > vector_type ;
365 typedef typename matrix_type::values_type matrix_values_type;
366 typedef typename matrix_type::graph_type matrix_graph_type;
368 typedef ScalarType value_type ;
377 using Teuchos::Array;
380 const size_t num_KL = var_degree.size();
381 Array< RCP<const abstract_basis_type> > bases(num_KL);
382 for (
size_t i=0; i<num_KL; i++) {
384 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
386 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
388 RCP<const product_basis_type> basis =
389 rcp(
new product_basis_type(
391 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
396 std::vector< std::vector<size_t> > fem_graph ;
398 const size_t fem_length = nGrid * nGrid * nGrid ;
405 const size_t stoch_length = basis->size();
406 std::vector< std::vector< int > > stoch_graph( stoch_length );
407#if defined(HAVE_MPI) && 0
413 *basis, *Cijk, comm);
414 for (
size_t i = 0 ; i < stoch_length ; ++i ) {
415 int len = cijk_graph->NumGlobalIndices(i);
416 stoch_graph[i].resize(len);
418 cijk_graph->ExtractGlobalRowCopy(i, len, len2, &stoch_graph[i][0]);
424 const size_t flat_length = fem_length * stoch_length ;
426 std::vector< std::vector<size_t> > flat_graph( flat_length );
428 for (
size_t iOuterRow = 0 ; iOuterRow < fem_length ; ++iOuterRow ) {
430 const size_t iOuterRowNZ = fem_graph[iOuterRow].size();
432 for (
size_t iInnerRow = 0 ; iInnerRow < stoch_length ; ++iInnerRow ) {
434 const size_t iInnerRowNZ = stoch_graph[ iInnerRow ].size(); ;
435 const size_t iFlatRowNZ = iOuterRowNZ * iInnerRowNZ ;
436 const size_t iFlatRow = iInnerRow + iOuterRow * stoch_length ;
438 flat_graph[iFlatRow].resize( iFlatRowNZ );
440 size_t iFlatEntry = 0 ;
442 for (
size_t iOuterEntry = 0 ; iOuterEntry < iOuterRowNZ ; ++iOuterEntry ) {
444 const size_t iOuterCol = fem_graph[iOuterRow][iOuterEntry];
446 for (
size_t iInnerEntry = 0 ; iInnerEntry < iInnerRowNZ ; ++iInnerEntry ) {
448 const size_t iInnerCol = stoch_graph[iInnerRow][iInnerEntry] ;
449 const size_t iFlatColumn = iInnerCol + iOuterCol * stoch_length ;
451 flat_graph[iFlatRow][iFlatEntry] = iFlatColumn ;
461 vector_type x = vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), flat_length );
462 vector_type y = vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), flat_length );
464 Kokkos::deep_copy( x , ScalarType(1.0) );
470 matrix.graph = Kokkos::create_staticcrsgraph<matrix_graph_type>(
471 std::string(
"testing") , flat_graph );
473 const size_t flat_graph_length = matrix.graph.entries.extent(0);
475 matrix.values = matrix_values_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), flat_graph_length );
477 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
484 Kokkos::Timer clock ;
485 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
490 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
491 const double flops = 2.0*1e-9*flat_graph_length / seconds_per_iter;
493 std::vector<double> perf(4);
494 perf[0] = flat_length ;
495 perf[1] = seconds_per_iter;
497 perf[3] = flat_graph_length ;
508template<
typename ScalarType ,
class Device >
511 const std::vector<int> & var_degree ,
513 const int iterCount ,
514 const bool symmetric )
516 typedef ScalarType value_type ;
517 typedef Kokkos::View< value_type* , Device > vector_type ;
522 typedef typename matrix_type::values_type matrix_values_type;
523 typedef typename matrix_type::graph_type matrix_graph_type;
525 typedef ScalarType value_type ;
534 using Teuchos::Array;
537 const size_t num_KL = var_degree.size();
538 Array< RCP<const abstract_basis_type> > bases(num_KL);
539 for (
size_t i=0; i<num_KL; i++) {
541 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
543 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
545 RCP<const product_basis_type> basis =
546 rcp(
new product_basis_type(
548 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
553 std::vector< std::vector<size_t> > fem_graph ;
555 const size_t fem_length = nGrid * nGrid * nGrid ;
562 const size_t stoch_length = basis->size();
563 std::vector< std::vector< int > > stoch_graph( stoch_length );
564#if defined(HAVE_MPI) && 0
570 *basis, *Cijk, comm);
571 for (
size_t i = 0 ; i < stoch_length ; ++i ) {
572 int len = cijk_graph->NumGlobalIndices(i);
573 stoch_graph[i].resize(len);
575 cijk_graph->ExtractGlobalRowCopy(i, len, len2, &stoch_graph[i][0]);
581 const size_t flat_length = fem_length * stoch_length ;
583 std::vector< std::vector<size_t> > flat_graph( flat_length );
585 for (
size_t iOuterRow = 0 ; iOuterRow < stoch_length ; ++iOuterRow ) {
587 const size_t iOuterRowNZ = stoch_graph[iOuterRow].size();
589 for (
size_t iInnerRow = 0 ; iInnerRow < fem_length ; ++iInnerRow ) {
591 const size_t iInnerRowNZ = fem_graph[iInnerRow].size();
592 const size_t iFlatRowNZ = iOuterRowNZ * iInnerRowNZ ;
593 const size_t iFlatRow = iInnerRow + iOuterRow * fem_length ;
595 flat_graph[iFlatRow].resize( iFlatRowNZ );
597 size_t iFlatEntry = 0 ;
599 for (
size_t iOuterEntry = 0 ; iOuterEntry < iOuterRowNZ ; ++iOuterEntry ) {
601 const size_t iOuterCol = stoch_graph[ iOuterRow ][ iOuterEntry ];
603 for (
size_t iInnerEntry = 0 ; iInnerEntry < iInnerRowNZ ; ++iInnerEntry ) {
605 const size_t iInnerCol = fem_graph[ iInnerRow][iInnerEntry];
606 const size_t iFlatColumn = iInnerCol + iOuterCol * fem_length ;
608 flat_graph[iFlatRow][iFlatEntry] = iFlatColumn ;
617 vector_type x = vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), flat_length );
618 vector_type y = vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), flat_length );
620 Kokkos::deep_copy( x , ScalarType(1.0) );
626 matrix.graph = Kokkos::create_staticcrsgraph<matrix_graph_type>( std::string(
"testing") , flat_graph );
628 const size_t flat_graph_length = matrix.graph.entries.extent(0);
630 matrix.values = matrix_values_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), flat_graph_length );
632 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
639 Kokkos::Timer clock ;
640 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
645 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
646 const double flops = 2.0*1e-9*flat_graph_length / seconds_per_iter;
648 std::vector<double> perf(4);
649 perf[0] = flat_length ;
650 perf[1] = seconds_per_iter;
652 perf[3] = flat_graph_length ;
656template<
typename ScalarType ,
class Device >
659 const std::vector<int> & var_degree ,
661 const int iterCount ,
662 const bool symmetric )
664 typedef ScalarType value_type ;
665 typedef Kokkos::View< value_type** ,
667 Device > block_vector_type ;
673 typedef typename matrix_type::graph_type graph_type ;
675 typedef ScalarType value_type ;
684 using Teuchos::Array;
687 const size_t num_KL = var_degree.size();
688 Array< RCP<const abstract_basis_type> > bases(num_KL);
689 for (
size_t i=0; i<num_KL; i++) {
691 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
693 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
695 RCP<const product_basis_type> basis =
696 rcp(
new product_basis_type(
698 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
703 std::vector< std::vector<size_t> > graph ;
705 const size_t outer_length = nGrid * nGrid * nGrid ;
713 Teuchos::ParameterList params;
714 params.set(
"Tile Size", 128);
715 params.set(
"Max Tiles", 10000);
717 Stokhos::create_stochastic_product_tensor< TensorType >( *basis, *Cijk,
719 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>( std::string(
"test crs graph") , graph );
721 const size_t inner_length = matrix.block.dimension();
722 const size_t inner_length_aligned = matrix.block.aligned_dimension();
725 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), inner_length_aligned , graph_length );
727 block_vector_type x =
728 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length_aligned , outer_length );
729 block_vector_type y =
730 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length_aligned , outer_length );
732 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
737 Kokkos::deep_copy( x , ScalarType(1.0) );
742 Kokkos::Timer clock ;
743 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
748 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
749 const double flops_per_block = matrix.block.tensor().num_flops();
750 const double flops = 1.0e-9*graph_length*flops_per_block;
755 std::vector<double> perf(6) ;
757 perf[0] = outer_length * inner_length ;
758 perf[1] = seconds_per_iter ;
759 perf[2] = flops / seconds_per_iter;
760 perf[3] = matrix.block.tensor().entry_count();
761 perf[4] = inner_length ;
762 perf[5] = flops_per_block;
767template<
typename ScalarType ,
class Device >
770 const std::vector<int> & var_degree ,
772 const int iterCount ,
773 const bool symmetric )
775 typedef ScalarType value_type ;
776 typedef Kokkos::View< value_type** ,
778 Device > block_vector_type ;
784 typedef typename matrix_type::graph_type graph_type ;
786 typedef ScalarType value_type ;
795 using Teuchos::Array;
798 const size_t num_KL = var_degree.size();
799 Array< RCP<const abstract_basis_type> > bases(num_KL);
800 for (
size_t i=0; i<num_KL; i++) {
802 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
804 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
806 RCP<const product_basis_type> basis =
807 rcp(
new product_basis_type(
809 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
814 std::vector< std::vector<size_t> > graph ;
816 const size_t outer_length = nGrid * nGrid * nGrid ;
824 Teuchos::ParameterList params;
825 params.set(
"Tile Size", 128);
827 Stokhos::create_stochastic_product_tensor< TensorType >( *basis, *Cijk,
829 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>( std::string(
"test crs graph") , graph );
831 const size_t inner_length = matrix.block.dimension();
832 const size_t inner_length_aligned = matrix.block.aligned_dimension();
835 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), inner_length_aligned , graph_length );
837 block_vector_type x =
838 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length_aligned , outer_length );
839 block_vector_type y =
840 block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length_aligned , outer_length );
842 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
847 Kokkos::deep_copy( x , ScalarType(1.0) );
852 Kokkos::Timer clock ;
853 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
858 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
859 const double flops_per_block = matrix.block.tensor().num_flops();
860 const double flops = 1.0e-9*graph_length*flops_per_block;
865 std::vector<double> perf(6) ;
867 perf[0] = outer_length * inner_length ;
868 perf[1] = seconds_per_iter ;
869 perf[2] = flops / seconds_per_iter;
870 perf[3] = matrix.block.tensor().entry_count();
871 perf[4] = inner_length ;
872 perf[5] = flops_per_block;
877template<
typename ScalarType ,
class Device >
880 const std::vector<int> & var_degree ,
882 const int iterCount ,
883 const bool symmetric )
885 typedef ScalarType value_type ;
886 typedef Kokkos::View< value_type** ,
888 Device > block_vector_type ;
894 typedef typename matrix_type::graph_type graph_type ;
896 typedef ScalarType value_type ;
905 using Teuchos::Array;
908 const size_t num_KL = var_degree.size();
909 Array< RCP<const abstract_basis_type> > bases(num_KL);
910 for (
size_t i=0; i<num_KL; i++) {
912 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
914 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
916 RCP<const product_basis_type> basis =
917 rcp(
new product_basis_type(
919 RCP<Cijk_type> Cijk =
925 std::vector< std::vector<size_t> > graph ;
927 const size_t outer_length = nGrid * nGrid * nGrid ;
936 Stokhos::create_stochastic_product_tensor< TensorType >( *basis,
938 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>( std::string(
"test crs graph") , graph );
940 const size_t inner_length = matrix.block.dimension();
942 matrix.values = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), inner_length , graph_length );
944 block_vector_type x = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length , outer_length );
945 block_vector_type y = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length , outer_length );
947 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
952 Kokkos::deep_copy( x , ScalarType(1.0) );
957 Kokkos::Timer clock ;
958 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
963 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
964 const double flops_per_block = matrix.block.tensor().num_flops();
965 const double flops = 1.0e-9*graph_length*flops_per_block;
970 std::vector<double> perf(6) ;
972 perf[0] = outer_length * inner_length ;
973 perf[1] = seconds_per_iter ;
974 perf[2] = flops / seconds_per_iter;
975 perf[3] = matrix.block.tensor().num_non_zeros();
976 perf[4] = inner_length ;
977 perf[5] = flops_per_block;
982template<
typename ScalarType ,
class Device >
985 const std::vector<int> & var_degree ,
987 const int iterCount ,
988 const bool symmetric )
990 typedef ScalarType value_type ;
991 typedef Kokkos::View< value_type** ,
993 Device > block_vector_type ;
999 typedef typename matrix_type::graph_type graph_type ;
1001 typedef ScalarType value_type ;
1010 using Teuchos::Array;
1013 const size_t num_KL = var_degree.size();
1014 Array< RCP<const abstract_basis_type> > bases(num_KL);
1015 for (
size_t i=0; i<num_KL; i++) {
1017 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
1019 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
1021 RCP<const product_basis_type> basis =
1022 rcp(
new product_basis_type(
1024 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
1029 std::vector< std::vector<size_t> > graph ;
1031 const size_t outer_length = nGrid * nGrid * nGrid ;
1037 matrix_type matrix ;
1039 Teuchos::ParameterList params;
1040 params.set(
"Symmetric", symmetric);
1042 Stokhos::create_stochastic_product_tensor< TensorType >( *basis,
1045 matrix.graph = Kokkos::create_staticcrsgraph<graph_type>( std::string(
"test crs graph") , graph );
1047 const size_t inner_length = matrix.block.tensor().dimension();
1048 const size_t inner_length_aligned = matrix.block.tensor().aligned_dimension();
1050 matrix.values = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), inner_length_aligned , graph_length );
1052 block_vector_type x = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length_aligned , outer_length );
1053 block_vector_type y = block_vector_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length_aligned , outer_length );
1055 Kokkos::deep_copy( matrix.values , ScalarType(1.0) );
1060 Kokkos::deep_copy( x , ScalarType(1.0) );
1065 Kokkos::Timer clock ;
1066 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
1071 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
1072 const double flops_per_block = matrix.block.tensor().num_flops();
1073 const double flops = 1.0e-9*graph_length*flops_per_block;
1078 std::vector<double> perf(6) ;
1080 perf[0] = outer_length * inner_length ;
1081 perf[1] = seconds_per_iter ;
1082 perf[2] = flops / seconds_per_iter;
1083 perf[3] = matrix.block.tensor().num_non_zeros();
1084 perf[4] = inner_length ;
1085 perf[5] = flops_per_block;
1090template<
typename ScalarType ,
class Device ,
class SparseMatOps >
1093 const std::vector<int> & var_degree ,
1095 const int iterCount ,
1096 const bool test_block ,
1097 const bool symmetric )
1099 typedef ScalarType value_type ;
1108 using Teuchos::Array;
1111 const size_t num_KL = var_degree.size();
1112 Array< RCP<const abstract_basis_type> > bases(num_KL);
1113 for (
size_t i=0; i<num_KL; i++) {
1115 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
1117 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
1119 RCP<const product_basis_type> basis =
1120 rcp(
new product_basis_type(
1122 const size_t outer_length = basis->size();
1123 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
1128 typedef typename matrix_type::values_type matrix_values_type;
1129 typedef typename matrix_type::graph_type matrix_graph_type;
1134 std::vector< std::vector<size_t> > fem_graph ;
1136 const size_t inner_length = nGrid * nGrid * nGrid ;
1137 const size_t graph_length =
1142 typedef Kokkos::View<value_type*,Device> vec_type ;
1144 std::vector<matrix_type> matrix( outer_length ) ;
1145 std::vector<vec_type> x( outer_length ) ;
1146 std::vector<vec_type> y( outer_length ) ;
1147 std::vector<vec_type> tmp( outer_length ) ;
1149 for (
size_t block=0; block<outer_length; ++block) {
1150 matrix[block].graph = Kokkos::create_staticcrsgraph<matrix_graph_type>( std::string(
"testing") , fem_graph );
1152 matrix[block].values = matrix_values_type( Kokkos::ViewAllocateWithoutInitializing(
"matrix"), graph_length );
1154 x[block] = vec_type( Kokkos::ViewAllocateWithoutInitializing(
"x"), inner_length );
1155 y[block] = vec_type( Kokkos::ViewAllocateWithoutInitializing(
"y"), inner_length );
1156 tmp[block] = vec_type( Kokkos::ViewAllocateWithoutInitializing(
"tmp"), inner_length );
1158 Kokkos::deep_copy( matrix[block].values , ScalarType(1.0) );
1160 Kokkos::deep_copy( x[block] , ScalarType(1.0) );
1161 Kokkos::deep_copy( y[block] , ScalarType(1.0) );
1166 Kokkos::Timer clock ;
1169 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
1174 typename Cijk_type::k_iterator k_begin = Cijk->k_begin();
1175 typename Cijk_type::k_iterator k_end = Cijk->k_end();
1176 for (
typename Cijk_type::k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
1177 int nj = Cijk->num_j(k_it);
1179 int k = index(k_it);
1180 typename Cijk_type::kj_iterator j_begin = Cijk->j_begin(k_it);
1181 typename Cijk_type::kj_iterator j_end = Cijk->j_end(k_it);
1182 std::vector<vec_type> xx(nj), yy(nj);
1184 for (
typename Cijk_type::kj_iterator j_it = j_begin; j_it != j_end;
1186 int j = index(j_it);
1194 for (
typename Cijk_type::kj_iterator j_it = j_begin; j_it != j_end;
1196 typename Cijk_type::kji_iterator i_begin = Cijk->i_begin(j_it);
1197 typename Cijk_type::kji_iterator i_end = Cijk->i_end(j_it);
1198 for (
typename Cijk_type::kji_iterator i_it = i_begin; i_it != i_end;
1200 int i = index(i_it);
1201 value_type c = value(i_it);
1213 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
1214 const double flops = 1.0e-9*(2.0*
static_cast<double>(n_apply)*graph_length+
1215 static_cast<double>(n_add)*inner_length);
1220 std::vector<double> perf(4);
1221 perf[0] = outer_length * inner_length;
1222 perf[1] = seconds_per_iter ;
1223 perf[2] = flops/seconds_per_iter;
1229template<
typename ScalarType ,
class Device ,
class SparseMatOps >
1232 const std::vector<int> & var_degree ,
1234 const int iterCount ,
1235 const bool test_block ,
1236 const bool symmetric )
1238 typedef ScalarType value_type ;
1247 using Teuchos::Array;
1250 const size_t num_KL = var_degree.size();
1251 Array< RCP<const abstract_basis_type> > bases(num_KL);
1252 for (
size_t i=0; i<num_KL; i++) {
1254 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
1256 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
1258 RCP<const product_basis_type> basis =
1259 rcp(
new product_basis_type(
1261 const size_t outer_length = basis->size();
1262 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
1267 typedef typename matrix_type::values_type matrix_values_type;
1268 typedef typename matrix_type::graph_type matrix_graph_type;
1273 std::vector< std::vector<size_t> > fem_graph ;
1275 const size_t inner_length = nGrid * nGrid * nGrid ;
1276 const size_t graph_length =
1281 typedef Kokkos::View<value_type*, Kokkos::LayoutLeft, Device, Kokkos::MemoryUnmanaged> vec_type ;
1282 typedef Kokkos::View<value_type**, Kokkos::LayoutLeft, Device> multi_vec_type ;
1284 std::vector<matrix_type> matrix( outer_length ) ;
1285 multi_vec_type x( Kokkos::ViewAllocateWithoutInitializing(
"x"),
1286 inner_length, outer_length ) ;
1287 multi_vec_type y(
"y", inner_length, outer_length ) ;
1288 multi_vec_type tmp_x(
"tmp_x", inner_length, outer_length ) ;
1289 multi_vec_type tmp_y(
"tmp_y", inner_length, outer_length ) ;
1291 Kokkos::deep_copy( x , ScalarType(1.0) );
1293 for (
size_t block=0; block<outer_length; ++block) {
1294 matrix[block].graph = Kokkos::create_staticcrsgraph<matrix_graph_type>(
1295 std::string(
"testing") , fem_graph );
1297 matrix[block].values = matrix_values_type(
"matrix" , graph_length );
1299 Kokkos::deep_copy( matrix[block].values , ScalarType(1.0) );
1304 Kokkos::Timer clock ;
1307 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
1310 typedef typename Cijk_type::k_iterator k_iterator;
1311 typedef typename Cijk_type::kj_iterator kj_iterator;
1312 typedef typename Cijk_type::kji_iterator kji_iterator;
1315 k_iterator k_begin = Cijk->k_begin();
1316 k_iterator k_end = Cijk->k_end();
1317 for (k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
1318 unsigned nj = Cijk->num_j(k_it);
1320 int k = index(k_it);
1321 kj_iterator j_begin = Cijk->j_begin(k_it);
1322 kj_iterator j_end = Cijk->j_end(k_it);
1323 std::vector<int> j_indices(nj);
1325 for (kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
1326 int j = index(j_it);
1327 vec_type xx = Kokkos::subview( x, Kokkos::ALL(),
j );
1328 vec_type tt = Kokkos::subview( tmp_x, Kokkos::ALL(), jdx++ );
1329 Kokkos::deep_copy(tt, xx);
1331 multi_vec_type tmp_x_view =
1332 Kokkos::subview( tmp_x, Kokkos::ALL(),
1333 std::make_pair(0u,nj));
1334 multi_vec_type tmp_y_view =
1335 Kokkos::subview( tmp_y, Kokkos::ALL(),
1336 std::make_pair(0u,nj));
1340 for (kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
1341 vec_type tmp_y_view =
1342 Kokkos::subview( tmp_y, Kokkos::ALL(), jdx++ );
1343 kji_iterator i_begin = Cijk->i_begin(j_it);
1344 kji_iterator i_end = Cijk->i_end(j_it);
1345 for (kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
1346 int i = index(i_it);
1347 value_type c = value(i_it);
1348 vec_type y_view = Kokkos::subview( y, Kokkos::ALL(), i );
1359 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
1360 const double flops = 1.0e-9*(2.0*
static_cast<double>(n_apply)*graph_length+
1361 static_cast<double>(n_add)*inner_length);
1366 std::vector<double> perf(4);
1367 perf[0] = outer_length * inner_length;
1368 perf[1] = seconds_per_iter ;
1369 perf[2] = flops/seconds_per_iter;
1375#ifdef HAVE_STOKHOS_KOKKOSLINALG
1376template<
typename ScalarType ,
class Device >
1378test_original_matrix_free_kokkos(
1379 const std::vector<int> & var_degree ,
1381 const int iterCount ,
1382 const bool test_block ,
1383 const bool symmetric )
1385 typedef ScalarType value_type ;
1394 using Teuchos::Array;
1397 const size_t num_KL = var_degree.size();
1398 Array< RCP<const abstract_basis_type> > bases(num_KL);
1399 for (
size_t i=0; i<num_KL; i++) {
1401 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,1.0,
true));
1403 bases[i] = Teuchos::rcp(
new basis_type(var_degree[i],1.0,2.0,
true));
1405 RCP<const product_basis_type> basis =
1406 rcp(
new product_basis_type(
1407 bases, ScalarTolerances<value_type>::sparse_cijk_tol()));
1408 const size_t outer_length = basis->size();
1409 RCP<Cijk_type> Cijk = basis->computeTripleProductTensor();
1414 typedef KokkosSparse::CrsMatrix<value_type,ordinal_type,Device> matrix_type;
1415 typedef typename matrix_type::values_type matrix_values_type;
1416 typedef typename matrix_type::StaticCrsGraphType matrix_graph_type;
1421 std::vector< std::vector<size_t> > fem_graph ;
1423 const size_t inner_length = nGrid * nGrid * nGrid ;
1424 const size_t graph_length =
1429 typedef Kokkos::View<value_type*,Kokkos::LayoutLeft,Device, Kokkos::MemoryUnmanaged>
vec_type ;
1430 typedef Kokkos::View<value_type**, Kokkos::LayoutLeft, Device> multi_vec_type;
1432 std::vector<matrix_type> matrix( outer_length ) ;
1433 multi_vec_type
x( Kokkos::ViewAllocateWithoutInitializing(
"x"),
1434 inner_length, outer_length ) ;
1435 multi_vec_type
y(
"y", inner_length, outer_length ) ;
1436 multi_vec_type tmp_x(
"tmp_x", inner_length, outer_length ) ;
1437 multi_vec_type tmp_y(
"tmp_y", inner_length, outer_length ) ;
1441 for (
size_t block=0; block<outer_length; ++block) {
1442 matrix_graph_type matrix_graph =
1443 Kokkos::create_staticcrsgraph<matrix_graph_type>(
1444 std::string(
"test crs graph") , fem_graph );
1446 matrix_values_type matrix_values = matrix_values_type(
1447 Kokkos::ViewAllocateWithoutInitializing(
"matrix"), graph_length );
1449 matrix[block] = matrix_type(
"matrix", outer_length, matrix_values,
1454 Kokkos::Timer clock ;
1457 for (
int iter = 0 ; iter < iterCount ; ++iter ) {
1460 typedef typename Cijk_type::k_iterator k_iterator;
1461 typedef typename Cijk_type::kj_iterator kj_iterator;
1462 typedef typename Cijk_type::kji_iterator kji_iterator;
1465 k_iterator k_begin = Cijk->k_begin();
1466 k_iterator k_end = Cijk->k_end();
1467 for (k_iterator k_it=k_begin; k_it!=k_end; ++k_it) {
1468 unsigned nj = Cijk->num_j(k_it);
1470 int k = index(k_it);
1471 kj_iterator j_begin = Cijk->j_begin(k_it);
1472 kj_iterator j_end = Cijk->j_end(k_it);
1474 for (kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
1475 int j = index(j_it);
1476 vec_type xx = Kokkos::subview( x, Kokkos::ALL(),
j );
1477 vec_type tt = Kokkos::subview( tmp_x, Kokkos::ALL(), jdx++ );
1480 multi_vec_type tmp_x_view =
1481 Kokkos::subview( tmp_x, Kokkos::ALL(),
1482 std::make_pair(0u,nj));
1483 multi_vec_type tmp_y_view =
1484 Kokkos::subview( tmp_y, Kokkos::ALL(),
1485 std::make_pair(0u,nj));
1489 for (kj_iterator j_it = j_begin; j_it != j_end; ++j_it) {
1491 Kokkos::subview( tmp_y, Kokkos::ALL(), jdx++ );
1492 kji_iterator i_begin = Cijk->i_begin(j_it);
1493 kji_iterator i_end = Cijk->i_end(j_it);
1494 for (kji_iterator i_it = i_begin; i_it != i_end; ++i_it) {
1495 int i = index(i_it);
1497 vec_type y_view = Kokkos::subview( y, Kokkos::ALL(), i );
1509 const double seconds_per_iter = clock.seconds() / ((
double) iterCount );
1510 const double flops = 1.0e-9*(2.0*
static_cast<double>(n_apply)*graph_length+
1511 static_cast<double>(n_add)*inner_length);
1516 std::vector<double> perf(4);
1517 perf[0] = outer_length * inner_length;
1518 perf[1] = seconds_per_iter ;
1519 perf[2] = flops/seconds_per_iter;
1526template<
class Scalar,
class Device >
1532 const bool test_block ,
1533 const bool symmetric )
1538 std::vector< std::vector<size_t> > fem_graph ;
1540 const size_t fem_nonzeros =
1545 std::cout.precision(8);
1549 std::cout << std::endl <<
"\"FEM NNZ = " << fem_nonzeros <<
"\"" << std::endl;
1551 std::cout << std::endl
1553 <<
"\"#Variable\" , "
1554 <<
"\"PolyDegree\" , "
1556 <<
"\"#TensorEntry\" , "
1557 <<
"\"VectorSize\" , "
1558 <<
"\"Original-Flat MXV-Time\" , "
1559 <<
"\"Original-Flat MXV-Speedup\" , "
1560 <<
"\"Original-Flat MXV-GFLOPS\" , "
1561 <<
"\"Commuted-Flat MXV-Speedup\" , "
1562 <<
"\"Commuted-Flat MXV-GFLOPS\" , "
1563 <<
"\"Block-Diagonal MXV-Speedup\" , "
1564 <<
"\"Block-Diagonal MXV-GFLOPS\" , "
1565 <<
"\"Block-Crs-Tensor MXV-Speedup\" , "
1566 <<
"\"Block-Crs-Tensor MXV-GFLOPS\" , "
1569 for (
int nvar = minvar ; nvar <= maxvar ; ++nvar ) {
1571 std::vector<int> var_degree( nvar , pdeg );
1575 const std::vector<double> perf_flat_original =
1576 test_product_flat_original_matrix<Scalar,Device>(
1577 var_degree , nGrid , nIter , symmetric );
1579 const std::vector<double> perf_flat_commuted =
1580 test_product_flat_commuted_matrix<Scalar,Device>(
1581 var_degree , nGrid , nIter , symmetric );
1583 const std::vector<double> perf_matrix =
1584 test_product_tensor_diagonal_matrix<Scalar,Device>(
1585 var_degree , nGrid , nIter , symmetric );
1587 const std::vector<double> perf_crs_tensor =
1588 test_product_tensor_matrix<Scalar,Stokhos::CrsProductTensor<Scalar,Device>,Device>(
1589 var_degree , nGrid , nIter , symmetric );
1591 if ( perf_flat_commuted[0] != perf_flat_original[0] ||
1592 perf_flat_commuted[3] != perf_flat_original[3] ) {
1593 std::cout <<
"ERROR: Original and commuted matrix sizes do not match"
1595 <<
" original size = " << perf_flat_original[0]
1596 <<
" , nonzero = " << perf_flat_original[3]
1598 <<
" commuted size = " << perf_flat_commuted[0]
1599 <<
" , nonzero = " << perf_flat_commuted[3]
1603 std::cout << nGrid <<
" , "
1606 << perf_crs_tensor[4] <<
" , "
1607 << perf_crs_tensor[3] <<
" , "
1608 << perf_flat_original[0] <<
" , "
1609 << perf_flat_original[1] <<
" , "
1610 << perf_flat_original[1] / perf_flat_original[1] <<
" , "
1611 << perf_flat_original[2] <<
" , "
1612 << perf_flat_original[1] / perf_flat_commuted[1] <<
" , "
1613 << perf_flat_commuted[2] <<
" , "
1614 << perf_flat_original[1] / perf_matrix[1] <<
" , "
1615 << perf_matrix[2] <<
" , "
1616 << perf_flat_original[1] / perf_crs_tensor[1] <<
" , "
1617 << perf_crs_tensor[2] <<
" , "
1624template<
class Scalar,
class Device ,
class SparseMatOps >
1630 const bool test_block ,
1631 const bool symmetric )
1633 std::cout.precision(8);
1637 std::vector< std::vector<size_t> > fem_graph ;
1638 const size_t graph_length =
1640 std::cout << std::endl <<
"\"FEM NNZ = " << graph_length <<
"\"" << std::endl;
1642 std::cout << std::endl
1644 <<
"\"#Variable\" , "
1645 <<
"\"PolyDegree\" , "
1647 <<
"\"#TensorEntry\" , "
1648 <<
"\"VectorSize\" , "
1649 <<
"\"Original-Matrix-Free-Block-MXV-Time\" , "
1650 <<
"\"Original-Matrix-Free-Block-MXV-Speedup\" , "
1651 <<
"\"Original-Matrix-Free-Block-MXV-GFLOPS\" , "
1652 <<
"\"Block-Crs-Tensor MXV-Speedup\" , "
1653 <<
"\"Block-Crs-Tensor MXV-GFLOPS\" , "
1660 for (
int nvar = minvar ; nvar <= maxvar ; ++nvar ) {
1661 std::vector<int> var_degree( nvar , pdeg );
1663 const std::vector<double> perf_crs_tensor =
1664 test_product_tensor_matrix<Scalar,Stokhos::CrsProductTensor<Scalar,Device>,Device>(
1665 var_degree , nGrid , nIter , symmetric );
1676 std::vector<double> perf_original_mat_free_block;
1677#if defined(HAVE_STOKHOS_KOKKOSLINALG)
1678#if defined( KOKKOS_ENABLE_CUDA )
1679 enum { is_cuda = std::is_same<Device,Kokkos::Cuda>::value };
1681 enum { is_cuda =
false };
1684 perf_original_mat_free_block =
1685 test_original_matrix_free_kokkos<Scalar,Device>(
1686 var_degree , nGrid , nIter , test_block , symmetric );
1688 perf_original_mat_free_block =
1689 test_original_matrix_free_view<Scalar,Device,SparseMatOps>(
1690 var_degree , nGrid , nIter , test_block , symmetric );
1692 perf_original_mat_free_block =
1693 test_original_matrix_free_view<Scalar,Device,SparseMatOps>(
1694 var_degree , nGrid , nIter , test_block , symmetric );
1697 std::cout << nGrid <<
" , "
1700 << perf_crs_tensor[4] <<
" , "
1701 << perf_crs_tensor[3] <<
" , "
1702 << perf_original_mat_free_block[0] <<
" , "
1703 << perf_original_mat_free_block[1] <<
" , "
1704 << perf_original_mat_free_block[1] /
1705 perf_original_mat_free_block[1] <<
" , "
1706 << perf_original_mat_free_block[2] <<
" , "
1707 << perf_original_mat_free_block[1] / perf_crs_tensor[1] <<
" , "
1708 << perf_crs_tensor[2] <<
" , "
1719template<
class Scalar,
class Device ,
class SparseMatOps >
1725 const bool test_block ,
1726 const bool symmetric )
1728 bool do_flat_sparse =
1729 std::is_same<typename Device::memory_space,Kokkos::HostSpace>::value ;
1731 std::cout.precision(8);
1735 std::vector< std::vector<size_t> > fem_graph ;
1736 const size_t graph_length =
1738 std::cout << std::endl <<
"\"FEM NNZ = " << graph_length <<
"\"" << std::endl;
1740 std::cout << std::endl
1742 <<
"\"#Variable\" , "
1743 <<
"\"PolyDegree\" , "
1745 <<
"\"#TensorEntry\" , "
1746 <<
"\"VectorSize\" , "
1747 <<
"\"Original-Matrix-Free-Block-MXV-Time\" , "
1748 <<
"\"Original-Matrix-Free-Block-MXV-Speedup\" , "
1749 <<
"\"Original-Matrix-Free-Block-MXV-GFLOPS\" , "
1750 <<
"\"Block-Crs-Tensor MXV-Speedup\" , "
1751 <<
"\"Block-Crs-Tensor MXV-GFLOPS\" , ";
1753 std::cout <<
"\"Block-Lexicographic-Sparse-3-Tensor MXV-Speedup\" , "
1754 <<
"\"Block-Lexicographic-Sparse-3-Tensor MXV-GFLOPS\" , "
1755 <<
"\"Lexicographic FLOPS / Crs FLOPS\" , ";
1756 std::cout << std::endl ;
1758 for (
int p = minp ; p <= maxp ; ++p ) {
1759 std::vector<int> var_degree( nvar , p );
1761 const std::vector<double> perf_crs_tensor =
1762 test_product_tensor_matrix<Scalar,Stokhos::CrsProductTensor<Scalar,Device>,Device>(
1763 var_degree , nGrid , nIter , symmetric );
1765 std::vector<double> perf_lexo_sparse_3_tensor;
1766 if (do_flat_sparse) {
1767 perf_lexo_sparse_3_tensor =
1768 test_lexo_block_tensor<Scalar,Device>( var_degree , nGrid , nIter , symmetric );
1771 const std::vector<double> perf_original_mat_free_block =
1772 test_original_matrix_free_vec<Scalar,Device,SparseMatOps>(
1773 var_degree , nGrid , nIter , test_block , symmetric );
1775 std::cout << nGrid <<
" , "
1778 << perf_crs_tensor[4] <<
" , "
1779 << perf_crs_tensor[3] <<
" , "
1780 << perf_original_mat_free_block[0] <<
" , "
1781 << perf_original_mat_free_block[1] <<
" , "
1782 << perf_original_mat_free_block[1] / perf_original_mat_free_block[1] <<
" , "
1783 << perf_original_mat_free_block[2] <<
" , "
1784 << perf_original_mat_free_block[1] / perf_crs_tensor[1] <<
" , "
1785 << perf_crs_tensor[2] <<
" , ";
1786 if (do_flat_sparse) {
1787 std::cout << perf_original_mat_free_block[1] / perf_lexo_sparse_3_tensor[1] <<
" , "
1788 << perf_lexo_sparse_3_tensor[2] <<
" , "
1789 << perf_lexo_sparse_3_tensor[5] / perf_crs_tensor[5];
1793 std::cout << std::endl ;
1799template<
class Scalar,
class Device ,
class SparseMatOps >
1805 const bool test_block ,
1806 const bool symmetric )
1808 std::cout.precision(8);
1812 std::vector< std::vector<size_t> > fem_graph ;
1813 const size_t graph_length =
1815 std::cout << std::endl <<
"\"FEM NNZ = " << graph_length <<
"\"" << std::endl;
1817 std::cout << std::endl
1819 <<
"\"#Variable\" , "
1820 <<
"\"PolyDegree\" , "
1822 <<
"\"#TensorEntry\" , "
1823 <<
"\"VectorSize\" , "
1824 <<
"\"Original-Matrix-Free-Block-MXV-Time\" , "
1825 <<
"\"Original-Matrix-Free-Block-MXV-Speedup\" , "
1826 <<
"\"Original-Matrix-Free-Block-MXV-GFLOPS\" , "
1827 <<
"\"Block-Crs-Tensor MXV-Speedup\" , "
1828 <<
"\"Block-Crs-Tensor MXV-GFLOPS\" , "
1829 <<
"\"Linear-Sparse-3-Tensor MXV-Speedup\" , "
1830 <<
"\"Linear-Sparse-3-Tensor MXV-GFLOPS\" , "
1833 for (
int nvar = minvar ; nvar <= maxvar ; nvar+=varinc ) {
1834 std::vector<int> var_degree( nvar , 1 );
1836 const std::vector<double> perf_crs_tensor =
1837 test_product_tensor_matrix<Scalar,Stokhos::CrsProductTensor<Scalar,Device>,Device>(
1838 var_degree , nGrid , nIter , symmetric );
1840 const std::vector<double> perf_linear_sparse_3_tensor =
1841 test_linear_tensor<Scalar,Device>( var_degree , nGrid , nIter , symmetric );
1843 const std::vector<double> perf_original_mat_free_block =
1844 test_original_matrix_free_vec<Scalar,Device,SparseMatOps>(
1845 var_degree , nGrid , nIter , test_block , symmetric );
1847 std::cout << nGrid <<
" , "
1850 << perf_crs_tensor[4] <<
" , "
1851 << perf_crs_tensor[3] <<
" , "
1852 << perf_original_mat_free_block[0] <<
" , "
1853 << perf_original_mat_free_block[1] <<
" , "
1854 << perf_original_mat_free_block[1] / perf_original_mat_free_block[1] <<
" , "
1855 << perf_original_mat_free_block[2] <<
" , "
1856 << perf_original_mat_free_block[1] / perf_crs_tensor[1] <<
" , "
1857 << perf_crs_tensor[2] <<
" , "
1858 << perf_original_mat_free_block[1] / perf_linear_sparse_3_tensor[1] <<
" , "
1859 << perf_linear_sparse_3_tensor[2] <<
" , "
1866template<
class Scalar,
class Device >
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
CRS matrix of dense blocks.
Data structure storing a sparse 3-tensor C(i,j,k) in a a tree-based format for lexicographically orde...
Sparse product tensor with replicated entries to provide subsets with a given coordinate.
A comparison functor implementing a strict weak ordering based lexographic ordering.
Sparse product tensor with replicated entries to provide subsets with a given coordinate.
Abstract base class for 1-D orthogonal polynomials.
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
Bases defined by combinatorial product of polynomial bases.
Symmetric diagonal storage for a dense matrix.
Multivariate orthogonal polynomial basis generated from a total order tensor product of univariate po...
Stokhos::LegendreBasis< int, double > basis_type
Sacado::MP::Vector< storage_type > vec_type
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< XD, XP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< YD, YP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< ZD, ZP... > >::value >::type update(const typename Kokkos::View< XD, XP... >::array_type::non_const_value_type &alpha, const Kokkos::View< XD, XP... > &x, const typename Kokkos::View< YD, YP... >::array_type::non_const_value_type &beta, const Kokkos::View< YD, YP... > &y, const typename Kokkos::View< ZD, ZP... >::array_type::non_const_value_type &gamma, const Kokkos::View< ZD, ZP... > &z)
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
Teuchos::RCP< LTBSparse3Tensor< ordinal_type, value_type > > computeTripleProductTensorLTBBlockLeaf(const TotalOrderBasis< ordinal_type, value_type, LexographicLess< MultiIndex< ordinal_type > > > &product_basis, bool symmetric=false)
Teuchos::RCP< Epetra_CrsGraph > sparse3Tensor2CrsGraph(const Stokhos::OrthogPolyBasis< ordinal_type, value_type > &basis, const Stokhos::Sparse3Tensor< ordinal_type, value_type > &Cijk, const Epetra_Comm &comm)
Build an Epetra_CrsGraph from a sparse 3 tensor.
void update(const ValueType &alpha, VectorType &x, const ValueType &beta, const VectorType &y)
void multiply(const CrsMatrix< MatrixValue, Device, Layout > &A, const InputMultiVectorType &x, OutputMultiVectorType &y, const std::vector< OrdinalType > &col_indices, SingleColumnMultivectorMultiply)
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType * x
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType ValueType * y
IntType map_fem_graph_coord(const IntType &N, const IntType &i, const IntType &j, const IntType &k)
void performance_test_driver_linear(const int minvar, const int maxvar, const int varinc, const int nGrid, const int nIter, const bool test_block, const bool symmetric)
void performance_test_driver_poly_deg(const int nvar, const int minp, const int maxp, const int nGrid, const int nIter, const bool test_block, const bool symmetric)
size_t generate_fem_graph(size_t N, std::vector< std::vector< size_t > > &graph)
std::vector< double > test_product_flat_original_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_product_tensor_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
void performance_test_driver_poly(const int pdeg, const int minvar, const int maxvar, const int nGrid, const int nIter, const bool test_block, const bool symmetric)
std::vector< double > test_product_tensor_diagonal_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_simple_tiled_product_tensor_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_linear_tensor(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_lexo_block_tensor(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_tiled_product_tensor_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
std::vector< double > test_original_matrix_free_view(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool test_block, const bool symmetric)
std::vector< double > test_original_matrix_free_vec(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool test_block, const bool symmetric)
void performance_test_driver_all(const int pdeg, const int minvar, const int maxvar, const int nGrid, const int nIter, const bool test_block, const bool symmetric)
std::vector< double > test_product_flat_commuted_matrix(const std::vector< int > &var_degree, const int nGrid, const int iterCount, const bool symmetric)
static scalar_type sparse_cijk_tol()
static scalar_type sparse_cijk_tol()