Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
TestMeanMultiply.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #include <iostream>
43 
44 // PCE scalar type
46 
47 // Kokkos CrsMatrix
48 #include "KokkosSparse_CrsMatrix.hpp"
49 #include "KokkosSparse_spmv.hpp"
52 
53 // Stokhos
57 
58 // Utilities
59 #include "impl/Kokkos_Timer.hpp"
60 
61 template< typename IntType >
62 inline
63 IntType map_fem_graph_coord( const IntType & N ,
64  const IntType & i ,
65  const IntType & j ,
66  const IntType & k )
67 {
68  return k + N * ( j + N * i );
69 }
70 
71 inline
72 size_t generate_fem_graph( size_t N ,
73  std::vector< std::vector<size_t> > & graph )
74 {
75  graph.resize( N * N * N , std::vector<size_t>() );
76 
77  size_t total = 0 ;
78 
79  for ( int i = 0 ; i < (int) N ; ++i ) {
80  for ( int j = 0 ; j < (int) N ; ++j ) {
81  for ( int k = 0 ; k < (int) N ; ++k ) {
82 
83  const size_t row = map_fem_graph_coord((int)N,i,j,k);
84 
85  graph[row].reserve(27);
86 
87  for ( int ii = -1 ; ii < 2 ; ++ii ) {
88  for ( int jj = -1 ; jj < 2 ; ++jj ) {
89  for ( int kk = -1 ; kk < 2 ; ++kk ) {
90  if ( 0 <= i + ii && i + ii < (int) N &&
91  0 <= j + jj && j + jj < (int) N &&
92  0 <= k + kk && k + kk < (int) N ) {
93  size_t col = map_fem_graph_coord((int)N,i+ii,j+jj,k+kk);
94 
95  graph[row].push_back(col);
96  }
97  }}}
98  total += graph[row].size();
99  }}}
100 
101  return total ;
102 }
103 
104 template <typename ScalarType, typename OrdinalType, typename Device>
105 void
106 test_mean_multiply(const OrdinalType order,
107  const OrdinalType dim,
108  const OrdinalType nGrid,
109  const OrdinalType iterCount,
110  std::vector<double>& scalar_perf,
111  std::vector<double>& block_left_perf,
112  std::vector<double>& block_right_perf,
113  std::vector<double>& pce_perf,
114  std::vector<double>& block_pce_perf)
115 {
116  typedef ScalarType value_type;
117  typedef OrdinalType ordinal_type;
118  typedef Device execution_space;
119 
122 
123  typedef Kokkos::View< value_type*, Kokkos::LayoutLeft, execution_space > scalar_vector_type;
124  typedef Kokkos::View< value_type**, Kokkos::LayoutLeft, execution_space > scalar_left_multi_vector_type;
125  typedef Kokkos::View< value_type**, Kokkos::LayoutRight, execution_space > scalar_right_multi_vector_type;
126  typedef Kokkos::View< pce_type*, Kokkos::LayoutLeft, execution_space > pce_vector_type;
127  typedef Kokkos::View< pce_type**, Kokkos::LayoutLeft, execution_space > pce_multi_vector_type;
128 
129  typedef KokkosSparse::CrsMatrix< value_type, ordinal_type, execution_space > scalar_matrix_type;
130  typedef KokkosSparse::CrsMatrix< pce_type, ordinal_type, execution_space > pce_matrix_type;
131  typedef typename scalar_matrix_type::StaticCrsGraphType matrix_graph_type;
132  typedef typename scalar_matrix_type::values_type scalar_matrix_values_type;
133  typedef typename pce_matrix_type::values_type pce_matrix_values_type;
134 
135  typedef Stokhos::OneDOrthogPolyBasis<ordinal_type,value_type> abstract_basis_type;
140  typedef typename pce_type::cijk_type kokkos_cijk_type;
141 
142  using Teuchos::rcp;
143  using Teuchos::RCP;
144  using Teuchos::Array;
145 
146  // Number of columns for PCE multi-vector apply
147  const ordinal_type num_pce_col = 5;
148 
149  // Create Stochastic Galerkin basis and expansion
150  Array< RCP<const abstract_basis_type> > bases(dim);
151  for (ordinal_type i=0; i<dim; ++i) {
152  bases[i] = Teuchos::rcp(new basis_type(order, true));
153  }
154  RCP<const product_basis_type> basis = rcp(new product_basis_type(bases));
155  RCP<cijk_type> cijk = basis->computeTripleProductTensor();
156  kokkos_cijk_type kokkos_cijk =
157  Stokhos::create_product_tensor<execution_space>(*basis, *cijk);
158  Kokkos::setGlobalCijkTensor(kokkos_cijk);
159 
160  //------------------------------
161  // Generate graph for "FEM" box structure:
162 
163  std::vector< std::vector<size_t> > fem_graph;
164  const size_t fem_length = nGrid * nGrid * nGrid;
165  const size_t graph_length = generate_fem_graph( nGrid , fem_graph );
166 
167  //------------------------------
168  // Generate input vectors:
169 
170  ordinal_type pce_size = basis->size();
171  scalar_left_multi_vector_type xl(Kokkos::ViewAllocateWithoutInitializing("scalar left x"), fem_length, pce_size);
172  scalar_left_multi_vector_type yl(Kokkos::ViewAllocateWithoutInitializing("scalar right y"), fem_length, pce_size);
173  scalar_right_multi_vector_type xr(Kokkos::ViewAllocateWithoutInitializing("scalar right x"), fem_length, pce_size);
174  scalar_right_multi_vector_type yr(Kokkos::ViewAllocateWithoutInitializing("scalar right y"), fem_length, pce_size);
175  std::vector<scalar_vector_type> x_col(pce_size), y_col(pce_size);
176  for (ordinal_type i=0; i<pce_size; ++i) {
177  x_col[i] = scalar_vector_type (Kokkos::ViewAllocateWithoutInitializing("scalar x col"), fem_length);
178  y_col[i] = scalar_vector_type(Kokkos::ViewAllocateWithoutInitializing("scalar y col"), fem_length);
179  Kokkos::deep_copy( x_col[i] , value_type(1.0) );
180  Kokkos::deep_copy( y_col[i] , value_type(0.0) );
181  }
182  pce_vector_type x_pce =
183  Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce x"),
184  kokkos_cijk, fem_length, pce_size);
185  pce_vector_type y_pce =
186  Kokkos::make_view<pce_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce y"),
187  kokkos_cijk, fem_length, pce_size);
188  pce_multi_vector_type x_multi_pce =
189  Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce multi x"),
190  kokkos_cijk, fem_length,
191  num_pce_col, pce_size);
192  pce_multi_vector_type y_multi_pce =
193  Kokkos::make_view<pce_multi_vector_type>(Kokkos::ViewAllocateWithoutInitializing("pce multi y"),
194  kokkos_cijk, fem_length,
195  num_pce_col, pce_size);
196 
197  Kokkos::deep_copy( xl , value_type(1.0) );
198  Kokkos::deep_copy( yl , value_type(0.0) );
199  Kokkos::deep_copy( xr , value_type(1.0) );
200  Kokkos::deep_copy( yr , value_type(0.0) );
201  Kokkos::deep_copy( x_pce , value_type(1.0) );
202  Kokkos::deep_copy( y_pce , value_type(0.0) );
203  Kokkos::deep_copy( x_multi_pce , value_type(1.0) );
204  Kokkos::deep_copy( y_multi_pce , value_type(0.0) );
205 
206  //------------------------------
207  // Generate matrix
208 
209  matrix_graph_type matrix_graph =
210  Kokkos::create_staticcrsgraph<matrix_graph_type>(
211  std::string("test crs graph"), fem_graph);
212  scalar_matrix_values_type scalar_matrix_values =
213  scalar_matrix_values_type(Kokkos::ViewAllocateWithoutInitializing("scalar matrix"), graph_length);
214  pce_matrix_values_type pce_matrix_values =
215  Kokkos::make_view<pce_matrix_values_type>(Kokkos::ViewAllocateWithoutInitializing("pce matrix"), kokkos_cijk, graph_length, 1);
216  scalar_matrix_type scalar_matrix("scalar matrix", fem_length,
217  scalar_matrix_values, matrix_graph);
218  pce_matrix_type pce_matrix("pce matrix", fem_length,
219  pce_matrix_values, matrix_graph);
220 
221  Kokkos::deep_copy( scalar_matrix_values , value_type(1.0) );
222  Kokkos::deep_copy( pce_matrix_values , value_type(1.0) );
223 
224  //------------------------------
225  // Scalar multiply
226 
227  {
228  // warm up
229  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
230  for (ordinal_type col=0; col<pce_size; ++col) {
231  // scalar_vector_type xc =
232  // Kokkos::subview(x, Kokkos::ALL(), col);
233  // scalar_vector_type yc =
234  // Kokkos::subview(y, Kokkos::ALL(), col);
235  // Kokkos::MV_Multiply( yc, scalar_matrix, xc );
236  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, x_col[col] , value_type(0.0) ,y_col[col]);
237  }
238  }
239 
240  execution_space::fence();
241  Kokkos::Impl::Timer clock ;
242  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
243  for (ordinal_type col=0; col<pce_size; ++col) {
244  // scalar_vector_type xc =
245  // Kokkos::subview(x, Kokkos::ALL(), col);
246  // scalar_vector_type yc =
247  // Kokkos::subview(y, Kokkos::ALL(), col);
248  // Kokkos::MV_Multiply( yc, scalar_matrix, xc );
249  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, x_col[col] , value_type(0.0) ,y_col[col]);
250  }
251  }
252  execution_space::fence();
253 
254  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
255  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
256 
257  scalar_perf.resize(5);
258  scalar_perf[0] = fem_length;
259  scalar_perf[1] = pce_size;
260  scalar_perf[2] = graph_length;
261  scalar_perf[3] = seconds_per_iter;
262  scalar_perf[4] = flops / seconds_per_iter;
263  }
264 
265  //------------------------------
266  // Block-left multiply
267 
268  {
269  // warm up
270  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
271  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xl , value_type(0.0) ,yl);
272  }
273 
274  execution_space::fence();
275  Kokkos::Impl::Timer clock ;
276  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
277  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xl , value_type(0.0) ,yl);
278  }
279  execution_space::fence();
280 
281  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
282  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
283 
284  block_left_perf.resize(5);
285  block_left_perf[0] = fem_length;
286  block_left_perf[1] = pce_size;
287  block_left_perf[2] = graph_length;
288  block_left_perf[3] = seconds_per_iter;
289  block_left_perf[4] = flops / seconds_per_iter;
290  }
291 
292  //------------------------------
293  // Block-right multiply
294 
295  {
296  // warm up
297  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
298  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xr , value_type(0.0) ,yr);
299  }
300 
301  execution_space::fence();
302  Kokkos::Impl::Timer clock ;
303  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
304  KokkosSparse::spmv( "N" , value_type(1.0) , scalar_matrix, xr , value_type(0.0) ,yr);
305  }
306  execution_space::fence();
307 
308  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
309  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
310 
311  block_right_perf.resize(5);
312  block_right_perf[0] = fem_length;
313  block_right_perf[1] = pce_size;
314  block_right_perf[2] = graph_length;
315  block_right_perf[3] = seconds_per_iter;
316  block_right_perf[4] = flops / seconds_per_iter;
317  }
318 
319  //------------------------------
320  // PCE multiply
321 
322  {
323  // warm up
324  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
325  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_pce , value_type(0.0) ,y_pce);
326  }
327 
328  execution_space::fence();
329  Kokkos::Impl::Timer clock ;
330  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
331  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_pce , value_type(0.0) ,y_pce);
332  }
333  execution_space::fence();
334 
335  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
336  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size;
337 
338  pce_perf.resize(5);
339  pce_perf[0] = fem_length;
340  pce_perf[1] = pce_size;
341  pce_perf[2] = graph_length;
342  pce_perf[3] = seconds_per_iter;
343  pce_perf[4] = flops / seconds_per_iter;
344  }
345 
346  //------------------------------
347  // PCE multi-vector multiply
348 
349  {
350  // warm up
351  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
352  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_multi_pce , value_type(0.0) ,y_multi_pce);
353  }
354 
355  execution_space::fence();
356  Kokkos::Impl::Timer clock ;
357  for (ordinal_type iter = 0; iter < iterCount; ++iter) {
358  KokkosSparse::spmv( "N" , value_type(1.0) , pce_matrix, x_multi_pce , value_type(0.0) ,y_multi_pce);
359  }
360  execution_space::fence();
361 
362  const double seconds_per_iter = clock.seconds() / ((double) iterCount );
363  const double flops = 1.0e-9 * 2.0 * graph_length * pce_size * num_pce_col;
364 
365  block_pce_perf.resize(5);
366  block_pce_perf[0] = fem_length;
367  block_pce_perf[1] = pce_size;
368  block_pce_perf[2] = graph_length;
369  block_pce_perf[3] = seconds_per_iter;
370  block_pce_perf[4] = flops / seconds_per_iter;
371  }
372 
373 }
374 
375 template <typename Scalar, typename Ordinal, typename Device>
377  const Ordinal nIter,
378  const Ordinal order,
379  const Ordinal min_var,
380  const Ordinal max_var )
381 {
382  std::cout.precision(8);
383  std::cout << std::endl
384  << "\"Grid Size\" , "
385  << "\"FEM Size\" , "
386  << "\"FEM Graph Size\" , "
387  << "\"Dimension\" , "
388  << "\"Order\" , "
389  << "\"PCE Size\" , "
390  << "\"Scalar SpMM Time\" , "
391  << "\"Scalar SpMM Speedup\" , "
392  << "\"Scalar SpMM GFLOPS\" , "
393  << "\"Block-Left SpMM Speedup\" , "
394  << "\"Block-Left SpMM GFLOPS\" , "
395  << "\"Block-Right SpMM Speedup\" , "
396  << "\"Block-Right SpMM GFLOPS\" , "
397  << "\"PCE SpMM Speedup\" , "
398  << "\"PCE SpMM GFLOPS\" , "
399  << "\"Block PCE SpMM Speedup\" , "
400  << "\"Block PCE SpMM GFLOPS\" , "
401  << std::endl;
402 
403  std::vector<double> perf_scalar, perf_block_left, perf_block_right,
404  perf_pce, perf_block_pce;
405  for (Ordinal dim=min_var; dim<=max_var; ++dim) {
406 
407  test_mean_multiply<Scalar,Ordinal,Device>(
408  order, dim, nGrid, nIter, perf_scalar, perf_block_left, perf_block_right,
409  perf_pce, perf_block_pce );
410 
411  std::cout << nGrid << " , "
412  << perf_scalar[0] << " , "
413  << perf_scalar[2] << " , "
414  << dim << " , "
415  << order << " , "
416  << perf_scalar[1] << " , "
417  << perf_scalar[3] << " , "
418  << perf_scalar[4] / perf_scalar[4] << " , "
419  << perf_scalar[4] << " , "
420  << perf_block_left[4]/ perf_scalar[4] << " , "
421  << perf_block_left[4] << " , "
422  << perf_block_right[4]/ perf_scalar[4] << " , "
423  << perf_block_right[4] << " , "
424  << perf_pce[4]/ perf_scalar[4] << " , "
425  << perf_pce[4] << " , "
426  << perf_block_pce[4]/ perf_scalar[4] << " , "
427  << perf_block_pce[4] << " , "
428  << std::endl;
429 
430  }
431 }
432 
433 #define INST_PERF_DRIVER(SCALAR, ORDINAL, DEVICE) \
434  template void performance_test_driver< SCALAR, ORDINAL, DEVICE >( \
435  const ORDINAL nGrid, const ORDINAL nIter, const ORDINAL order, \
436  const ORDINAL min_var, const ORDINAL max_var);
Stokhos::StandardStorage< int, double > storage_type
Multivariate orthogonal polynomial basis generated from a total order tensor product of univariate po...
Data structure storing a sparse 3-tensor C(i,j,k) in a a compressed format.
IntType map_fem_graph_coord(const IntType &N, const IntType &i, const IntType &j, const IntType &k)
void performance_test_driver(const Ordinal nGrid, const Ordinal nIter, const Ordinal order, const Ordinal min_var, const Ordinal max_var)
Kokkos::DefaultExecutionSpace execution_space
Sacado::PCE::OrthogPoly< double, Storage > pce_type
void test_mean_multiply(const OrdinalType order, const OrdinalType dim, const OrdinalType nGrid, const OrdinalType iterCount, std::vector< double > &scalar_perf, std::vector< double > &block_left_perf, std::vector< double > &block_right_perf, std::vector< double > &pce_perf, std::vector< double > &block_pce_perf)
Stokhos::LegendreBasis< int, double > basis_type
void deep_copy(const Stokhos::CrsMatrix< ValueType, DstDevice, Layout > &dst, const Stokhos::CrsMatrix< ValueType, SrcDevice, Layout > &src)
void setGlobalCijkTensor(const cijk_type &cijk)
KOKKOS_INLINE_FUNCTION constexpr std::enable_if< is_view_uq_pce< view_type >::value, typename CijkType< view_type >::type >::type cijk(const view_type &view)
Legendre polynomial basis.
Abstract base class for 1-D orthogonal polynomials.
size_t generate_fem_graph(size_t N, std::vector< std::vector< size_t > > &graph)
A comparison functor implementing a strict weak ordering based lexographic ordering.
std::enable_if< Kokkos::is_view_uq_pce< Kokkos::View< InputType, InputP... > >::value &&Kokkos::is_view_uq_pce< Kokkos::View< OutputType, OutputP... > >::value >::type spmv(const char mode[], const AlphaType &a, const MatrixType &A, const Kokkos::View< InputType, InputP... > &x, const BetaType &b, const Kokkos::View< OutputType, OutputP... > &y, const RANK_ONE)