MueLu  Version of the Day
MueLu_CoalesceDropFactory_kokkos_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
47 #define MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
48 
49 #ifdef HAVE_MUELU_KOKKOS_REFACTOR
50 #include <Kokkos_Core.hpp>
51 #include <KokkosSparse_CrsMatrix.hpp>
52 
53 #include "Xpetra_Matrix.hpp"
54 
56 
57 #include "MueLu_AmalgamationInfo.hpp"
58 #include "MueLu_Exceptions.hpp"
59 #include "MueLu_Level.hpp"
60 #include "MueLu_LWGraph_kokkos.hpp"
61 #include "MueLu_MasterList.hpp"
62 #include "MueLu_Monitor.hpp"
63 #include "MueLu_Utilities_kokkos.hpp"
64 
65 namespace MueLu {
66 
67 
68  namespace { // anonymous
69 
70  template<class LO, class RowType>
71  class ScanFunctor {
72  public:
73  ScanFunctor(RowType rows_) : rows(rows_) { }
74 
75  KOKKOS_INLINE_FUNCTION
76  void operator()(const LO i, LO& upd, const bool& final) const {
77  upd += rows(i);
78  if (final)
79  rows(i) = upd;
80  }
81 
82  private:
83  RowType rows;
84  };
85 
86  template<class LO, class GhostedViewType>
87  class ClassicalDropFunctor {
88  private:
89  typedef typename GhostedViewType::value_type SC;
90  typedef Kokkos::ArithTraits<SC> ATS;
91  typedef typename ATS::magnitudeType magnitudeType;
92 
93  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
94  magnitudeType eps;
95 
96  public:
97  ClassicalDropFunctor(GhostedViewType ghostedDiag, magnitudeType threshold) :
98  diag(ghostedDiag),
99  eps(threshold)
100  { }
101 
102  // Return true if we drop, false if not
103  KOKKOS_FORCEINLINE_FUNCTION
104  bool operator()(LO row, LO col, SC val) const {
105  // We avoid square root by using squared values
106  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
107  auto aij2 = ATS::magnitude(val) * ATS::magnitude(val); // |a_ij|^2
108 
109  return (aij2 <= eps*eps * aiiajj);
110  }
111  };
112 
113  template<class LO, class CoordsType>
114  class DistanceFunctor {
115  private:
116  typedef typename CoordsType::value_type SC;
117  typedef Kokkos::ArithTraits<SC> ATS;
118  typedef typename ATS::magnitudeType magnitudeType;
119 
120  public:
121  typedef SC value_type;
122 
123  public:
124  DistanceFunctor(CoordsType coords_) : coords(coords_) { }
125 
126  KOKKOS_INLINE_FUNCTION
127  magnitudeType distance2(LO row, LO col) const {
128  SC d = ATS::zero(), s;
129  for (size_t j = 0; j < coords.extent(1); j++) {
130  s = coords(row,j) - coords(col,j);
131  d += s*s;
132  }
133  return ATS::magnitude(d);
134  }
135  private:
136  CoordsType coords;
137  };
138 
139  template<class LO, class GhostedViewType, class DistanceFunctor>
140  class DistanceLaplacianDropFunctor {
141  private:
142  typedef typename GhostedViewType::value_type SC;
143  typedef Kokkos::ArithTraits<SC> ATS;
144  typedef typename ATS::magnitudeType magnitudeType;
145 
146  public:
147  DistanceLaplacianDropFunctor(GhostedViewType ghostedLaplDiag, DistanceFunctor distFunctor_, magnitudeType threshold) :
148  diag(ghostedLaplDiag),
149  distFunctor(distFunctor_),
150  eps(threshold)
151  { }
152 
153  // Return true if we drop, false if not
154  KOKKOS_INLINE_FUNCTION
155  bool operator()(LO row, LO col, SC val) const {
156  // We avoid square root by using squared values
157 
158  // We ignore incoming value of val as we operate on an auxiliary
159  // distance Laplacian matrix
160  typedef typename DistanceFunctor::value_type dSC;
161  typedef Kokkos::ArithTraits<dSC> dATS;
162  auto fval = dATS::one() / distFunctor.distance2(row, col);
163 
164  auto aiiajj = ATS::magnitude(diag(row, 0)) * ATS::magnitude(diag(col, 0)); // |a_ii|*|a_jj|
165  auto aij2 = ATS::magnitude(fval) * ATS::magnitude(fval); // |a_ij|^2
166 
167  return (aij2 <= eps*eps * aiiajj);
168  }
169 
170  private:
171  GhostedViewType diag; // corresponds to overlapped diagonal multivector (2D View)
172  DistanceFunctor distFunctor;
173  magnitudeType eps;
174  };
175 
176  template<class SC, class LO, class MatrixType, class BndViewType, class DropFunctorType>
177  class ScalarFunctor {
178  private:
179  typedef typename MatrixType::StaticCrsGraphType graph_type;
180  typedef typename graph_type::row_map_type rows_type;
181  typedef typename graph_type::entries_type cols_type;
182  typedef typename MatrixType::values_type vals_type;
183  typedef Kokkos::ArithTraits<SC> ATS;
184  typedef typename ATS::magnitudeType magnitudeType;
185 
186  public:
187  ScalarFunctor(MatrixType A_, BndViewType bndNodes_, DropFunctorType dropFunctor_,
188  typename rows_type::non_const_type rows_,
189  typename cols_type::non_const_type colsAux_,
190  typename vals_type::non_const_type valsAux_,
191  bool reuseGraph_, bool lumping_, SC threshold_) :
192  A(A_),
193  bndNodes(bndNodes_),
194  dropFunctor(dropFunctor_),
195  rows(rows_),
196  colsAux(colsAux_),
197  valsAux(valsAux_),
198  reuseGraph(reuseGraph_),
199  lumping(lumping_)
200  {
201  rowsA = A.graph.row_map;
202  zero = ATS::zero();
203  }
204 
205  KOKKOS_INLINE_FUNCTION
206  void operator()(const LO row, LO& nnz) const {
207  auto rowView = A.rowConst(row);
208  auto length = rowView.length;
209  auto offset = rowsA(row);
210 
211  SC diag = zero;
212  LO rownnz = 0;
213  LO diagID = -1;
214  for (decltype(length) colID = 0; colID < length; colID++) {
215  LO col = rowView.colidx(colID);
216  SC val = rowView.value (colID);
217 
218  if (!dropFunctor(row, col, rowView.value(colID)) || row == col) {
219  colsAux(offset+rownnz) = col;
220 
221  LO valID = (reuseGraph ? colID : rownnz);
222  valsAux(offset+valID) = val;
223  if (row == col)
224  diagID = valID;
225 
226  rownnz++;
227 
228  } else {
229  // Rewrite with zeros (needed for reuseGraph)
230  valsAux(offset+colID) = zero;
231  diag += val;
232  }
233  }
234  // How to assert on the device?
235  // assert(diagIndex != -1);
236  rows(row+1) = rownnz;
237  // if (lumping && diagID != -1) {
238  if (lumping) {
239  // Add diag to the diagonal
240 
241  // NOTE_KOKKOS: valsAux was allocated with
242  // ViewAllocateWithoutInitializing. This is not a problem here
243  // because we explicitly set this value above.
244  valsAux(offset+diagID) += diag;
245  }
246 
247  // If the only element remaining after filtering is diagonal, mark node as boundary
248  // FIXME: this should really be replaced by the following
249  // if (indices.size() == 1 && indices[0] == row)
250  // boundaryNodes[row] = true;
251  // We do not do it this way now because there is no framework for distinguishing isolated
252  // and boundary nodes in the aggregation algorithms
253  bndNodes(row) = (rownnz == 1);
254 
255  nnz += rownnz;
256  }
257 
258  private:
259  MatrixType A;
260  BndViewType bndNodes;
261  DropFunctorType dropFunctor;
262 
263  rows_type rowsA;
264 
265  typename rows_type::non_const_type rows;
266  typename cols_type::non_const_type colsAux;
267  typename vals_type::non_const_type valsAux;
268 
269  bool reuseGraph;
270  bool lumping;
271  SC zero;
272  };
273 
274  // collect number nonzeros of blkSize rows in nnz_(row+1)
275  template<class MatrixType, class NnzType, class blkSizeType>
276  class Stage1aVectorFunctor {
277  private:
278  typedef typename MatrixType::ordinal_type LO;
279 
280  public:
281  Stage1aVectorFunctor(MatrixType kokkosMatrix_, NnzType nnz_, blkSizeType blkSize_) :
282  kokkosMatrix(kokkosMatrix_),
283  nnz(nnz_),
284  blkSize(blkSize_) { }
285 
286  KOKKOS_INLINE_FUNCTION
287  void operator()(const LO row, LO& totalnnz) const {
288 
289  // the following code is more or less what MergeRows is doing
290  // count nonzero entries in all dof rows associated with node row
291  LO nodeRowMaxNonZeros = 0;
292  for (LO j = 0; j < blkSize; j++) {
293  auto rowView = kokkosMatrix.row(row * blkSize + j);
294  nodeRowMaxNonZeros += rowView.length;
295  }
296  nnz(row + 1) = nodeRowMaxNonZeros;
297  totalnnz += nodeRowMaxNonZeros;
298  }
299 
300 
301  private:
302  MatrixType kokkosMatrix; //< local matrix part
303  NnzType nnz; //< View containing number of nonzeros for current row
304  blkSizeType blkSize; //< block size (or partial block size in strided maps)
305  };
306 
307 
308  // build the dof-based column map containing the local dof ids belonging to blkSize rows in matrix
309  // the DofIds may not be sorted.
310  template<class MatrixType, class NnzType, class blkSizeType, class ColDofType>
311  class Stage1bVectorFunctor {
312  private:
313  typedef typename MatrixType::ordinal_type LO;
314  //typedef typename MatrixType::value_type SC;
315  //typedef typename MatrixType::device_type NO;
316 
317  private:
318  MatrixType kokkosMatrix; //< local matrix part
319  NnzType nnz; //< View containing dof offsets for dof columns
320  blkSizeType blkSize; //< block size (or partial block size in strided maps)
321  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
322 
323  public:
324  Stage1bVectorFunctor(MatrixType kokkosMatrix_,
325  NnzType nnz_,
326  blkSizeType blkSize_,
327  ColDofType coldofs_) :
328  kokkosMatrix(kokkosMatrix_),
329  nnz(nnz_),
330  blkSize(blkSize_),
331  coldofs(coldofs_) {
332  }
333 
334  KOKKOS_INLINE_FUNCTION
335  void operator()(const LO rowNode) const {
336 
337  LO pos = nnz(rowNode);
338  for (LO j = 0; j < blkSize; j++) {
339  auto rowView = kokkosMatrix.row(rowNode * blkSize + j);
340  auto numIndices = rowView.length;
341 
342  for (decltype(numIndices) k = 0; k < numIndices; k++) {
343  auto dofID = rowView.colidx(k);
344  coldofs(pos) = dofID;
345  pos ++;
346  }
347  }
348  }
349 
350  };
351 
352  // sort column ids
353  // translate them into (unique) node ids
354  // count the node column ids per node row
355  template<class MatrixType, class ColDofNnzType, class ColDofType, class Dof2NodeTranslationType, class BdryNodeType>
356  class Stage1cVectorFunctor {
357  private:
358  typedef typename MatrixType::ordinal_type LO;
359 
360  private:
361  ColDofNnzType coldofnnz; //< view containing start and stop indices for subviews
362  ColDofType coldofs; //< view containing the local dof ids associated with columns for the blkSize rows (not sorted)
363  Dof2NodeTranslationType dof2node; //< view containing the local node id associated with the local dof id
364  ColDofNnzType colnodennz; //< view containing number of column nodes for each node row
365  BdryNodeType bdrynode; //< view containing with numNodes booleans. True if node is (full) dirichlet boundardy node.
366 
367  public:
368  Stage1cVectorFunctor(ColDofNnzType coldofnnz_, ColDofType coldofs_, Dof2NodeTranslationType dof2node_, ColDofNnzType colnodennz_, BdryNodeType bdrynode_) :
369  coldofnnz(coldofnnz_),
370  coldofs(coldofs_),
371  dof2node(dof2node_),
372  colnodennz(colnodennz_),
373  bdrynode(bdrynode_) {
374  }
375 
376  KOKKOS_INLINE_FUNCTION
377  void operator()(const LO rowNode, LO& nnz) const {
378  LO begin = coldofnnz(rowNode);
379  LO end = coldofnnz(rowNode+1);
380  LO n = end - begin;
381  for (LO i = 0; i < (n-1); i++) {
382  for (LO j = 0; j < (n-i-1); j++) {
383  if (coldofs(j+begin) > coldofs(j+begin+1)) {
384  LO temp = coldofs(j+begin);
385  coldofs(j+begin) = coldofs(j+begin+1);
386  coldofs(j+begin+1) = temp;
387  }
388  }
389  }
390  size_t cnt = 0;
391  LO lastNodeID = -1;
392  for (LO i = 0; i < n; i++) {
393  LO dofID = coldofs(begin + i);
394  LO nodeID = dof2node(dofID);
395  if(nodeID != lastNodeID) {
396  lastNodeID = nodeID;
397  coldofs(begin+cnt) = nodeID;
398  cnt++;
399  }
400  }
401  if(cnt == 1)
402  bdrynode(rowNode) = true;
403  else
404  bdrynode(rowNode) = false;
405  colnodennz(rowNode+1) = cnt;
406  nnz += cnt;
407  }
408 
409  };
410 
411  // fill column node id view
412  template<class MatrixType, class ColDofNnzType, class ColDofType, class ColNodeNnzType, class ColNodeType>
413  class Stage1dVectorFunctor {
414  private:
415  typedef typename MatrixType::ordinal_type LO;
416  typedef typename MatrixType::value_type SC;
417 
418  private:
419  ColDofType coldofs; //< view containing mixed node and dof indices (only input)
420  ColDofNnzType coldofnnz; //< view containing the start and stop indices for subviews (dofs)
421  ColNodeType colnodes; //< view containing the local node ids associated with columns
422  ColNodeNnzType colnodennz; //< view containing start and stop indices for subviews
423 
424  public:
425  Stage1dVectorFunctor(ColDofType coldofs_, ColDofNnzType coldofnnz_, ColNodeType colnodes_, ColNodeNnzType colnodennz_) :
426  coldofs(coldofs_),
427  coldofnnz(coldofnnz_),
428  colnodes(colnodes_),
429  colnodennz(colnodennz_) {
430  }
431 
432  KOKKOS_INLINE_FUNCTION
433  void operator()(const LO rowNode) const {
434  auto dofbegin = coldofnnz(rowNode);
435  auto nodebegin = colnodennz(rowNode);
436  auto nodeend = colnodennz(rowNode+1);
437  auto n = nodeend - nodebegin;
438 
439  for (decltype(nodebegin) i = 0; i < n; i++) {
440  colnodes(nodebegin + i) = coldofs(dofbegin + i);
441  }
442  }
443  };
444 
445 
446  } // namespace
447 
448  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class DeviceType>
449  RCP<const ParameterList> CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::GetValidParameterList() const {
450  RCP<ParameterList> validParamList = rcp(new ParameterList());
451 
452 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
453  SET_VALID_ENTRY("aggregation: drop tol");
454  SET_VALID_ENTRY("aggregation: Dirichlet threshold");
455  SET_VALID_ENTRY("aggregation: drop scheme");
456  SET_VALID_ENTRY("filtered matrix: use lumping");
457  SET_VALID_ENTRY("filtered matrix: reuse graph");
458  SET_VALID_ENTRY("filtered matrix: reuse eigenvalue");
459  {
461  validParamList->getEntry("aggregation: drop scheme").setValidator(
462  rcp(new validatorType(Teuchos::tuple<std::string>("classical", "distance laplacian"), "aggregation: drop scheme")));
463  }
464 #undef SET_VALID_ENTRY
465  validParamList->set< bool > ("lightweight wrap", true, "Experimental option for lightweight graph access");
466 
467  validParamList->set< RCP<const FactoryBase> >("A", Teuchos::null, "Generating factory of the matrix A");
468  validParamList->set< RCP<const FactoryBase> >("UnAmalgamationInfo", Teuchos::null, "Generating factory for UnAmalgamationInfo");
469  validParamList->set< RCP<const FactoryBase> >("Coordinates", Teuchos::null, "Generating factory for Coordinates");
470 
471  return validParamList;
472  }
473 
474  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
475  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::DeclareInput(Level &currentLevel) const {
476  Input(currentLevel, "A");
477  Input(currentLevel, "UnAmalgamationInfo");
478 
479  const ParameterList& pL = GetParameterList();
480  if (pL.get<bool>("lightweight wrap") == true) {
481  if (pL.get<std::string>("aggregation: drop scheme") == "distance laplacian")
482  Input(currentLevel, "Coordinates");
483  }
484  }
485 
486  template <class Scalar,class LocalOrdinal, class GlobalOrdinal, class DeviceType>
487  void CoalesceDropFactory_kokkos<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosDeviceWrapperNode<DeviceType>>::
488  Build(Level& currentLevel) const {
489  FactoryMonitor m(*this, "Build", currentLevel);
490 
491  typedef Teuchos::ScalarTraits<SC> STS;
492  const SC zero = STS::zero();
493 
494  auto A = Get< RCP<Matrix> >(currentLevel, "A");
495  LO blkSize = A->GetFixedBlockSize();
496 
497  auto amalInfo = Get< RCP<AmalgamationInfo> >(currentLevel, "UnAmalgamationInfo");
498 
499  const ParameterList& pL = GetParameterList();
500 
501  bool doLightWeightWrap = pL.get<bool>("lightweight wrap");
502  GetOStream(Warnings0) << "lightweight wrap is deprecated" << std::endl;
503  TEUCHOS_TEST_FOR_EXCEPTION(!doLightWeightWrap, Exceptions::RuntimeError,
504  "MueLu KokkosRefactor only supports \"lightweight wrap\"=\"true\"");
505 
506  std::string algo = pL.get<std::string>("aggregation: drop scheme");
507 
508  double threshold = pL.get<double>("aggregation: drop tol");
509  GetOStream(Runtime0) << "algorithm = \"" << algo << "\": threshold = " << threshold
510  << ", blocksize = " << A->GetFixedBlockSize() << std::endl;
511 
512  const typename STS::magnitudeType dirichletThreshold =
513  STS::magnitude(as<SC>(pL.get<double>("aggregation: Dirichlet threshold")));
514 
515  GO numDropped = 0, numTotal = 0;
516 
517  RCP<LWGraph_kokkos> graph;
518  LO dofsPerNode = -1;
519 
520  typedef typename LWGraph_kokkos::boundary_nodes_type boundary_nodes_type;
521  boundary_nodes_type boundaryNodes;
522 
523  RCP<Matrix> filteredA;
524  if (blkSize == 1 && threshold == zero) {
525  // Scalar problem without dropping
526 
527  // Detect and record rows that correspond to Dirichlet boundary conditions
528  boundaryNodes = Utilities_kokkos::DetectDirichletRows(*A, dirichletThreshold);
529 
530  // Trivial LWGraph construction
531  graph = rcp(new LWGraph_kokkos(A->getLocalMatrix().graph, A->getRowMap(), A->getColMap(), "graph of A"));
532  graph->SetBoundaryNodeMap(boundaryNodes);
533 
534  numTotal = A->getNodeNumEntries();
535  dofsPerNode = 1;
536 
537  filteredA = A;
538 
539  } else if (blkSize == 1 && threshold != zero) {
540  // Scalar problem with dropping
541 
542  typedef typename Matrix::local_matrix_type local_matrix_type;
543  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
544  typedef typename kokkos_graph_type::row_map_type::non_const_type rows_type;
545  typedef typename kokkos_graph_type::entries_type::non_const_type cols_type;
546  typedef typename local_matrix_type::values_type::non_const_type vals_type;
547 
548  LO numRows = A->getNodeNumRows();
549  auto kokkosMatrix = A->getLocalMatrix();
550  auto nnzA = kokkosMatrix.nnz();
551  auto rowsA = kokkosMatrix.graph.row_map;
552 
553  typedef Kokkos::ArithTraits<SC> ATS;
554  typedef typename ATS::magnitudeType magnitudeType;
555 
556  bool reuseGraph = pL.get<bool>("filtered matrix: reuse graph");
557  bool lumping = pL.get<bool>("filtered matrix: use lumping");
558  if (lumping)
559  GetOStream(Runtime0) << "Lumping dropped entries" << std::endl;
560 
561  // FIXME_KOKKOS: replace by ViewAllocateWithoutInitializing + setting a single value
562  rows_type rows ("FA_rows", numRows+1);
563  cols_type colsAux(Kokkos::ViewAllocateWithoutInitializing("FA_aux_cols"), nnzA);
564  vals_type valsAux;
565  if (reuseGraph) {
566  SubFactoryMonitor m2(*this, "CopyMatrix", currentLevel);
567 
568  // Share graph with the original matrix
569  filteredA = MatrixFactory::Build(A->getCrsGraph());
570 
571  // Do a no-op fill-complete
572  RCP<ParameterList> fillCompleteParams(new ParameterList);
573  fillCompleteParams->set("No Nonlocal Changes", true);
574  filteredA->fillComplete(fillCompleteParams);
575 
576  // No need to reuseFill, just modify in place
577  valsAux = filteredA->getLocalMatrix().values;
578 
579  } else {
580  // Need an extra array to compress
581  valsAux = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_aux_vals"), nnzA);
582  }
583 
584  typename boundary_nodes_type::non_const_type bndNodes(Kokkos::ViewAllocateWithoutInitializing("boundaryNodes"), numRows);
585 
586  LO nnzFA = 0;
587  {
588  if (algo == "classical") {
589  // Construct overlapped matrix diagonal
590  RCP<Vector> ghostedDiag;
591  {
592  SubFactoryMonitor m2(*this, "Ghosted diag construction", currentLevel);
593  ghostedDiag = Utilities_kokkos::GetMatrixOverlappedDiagonal(*A);
594  }
595 
596  // Filter out entries
597  {
598  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
599 
600  auto ghostedDiagView = ghostedDiag->template getLocalView<DeviceType>();
601 
602  ClassicalDropFunctor<LO, decltype(ghostedDiagView)> dropFunctor(ghostedDiagView, threshold);
603  ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
604  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
605 
606  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
607  scalarFunctor, nnzFA);
608  }
609 
610  } else if (algo == "distance laplacian") {
611  typedef Xpetra::MultiVector<double,LO,GO,NO> doubleMultiVector;
612  auto coords = Get<RCP<doubleMultiVector> >(currentLevel, "Coordinates");
613 
614  auto uniqueMap = A->getRowMap();
615  auto nonUniqueMap = A->getColMap();
616 
617  // Construct ghosted coordinates
618  RCP<const Import> importer;
619  {
620  SubFactoryMonitor m2(*this, "Coords Import construction", currentLevel);
621  importer = ImportFactory::Build(uniqueMap, nonUniqueMap);
622  }
623  RCP<doubleMultiVector> ghostedCoords;
624  {
625  SubFactoryMonitor m2(*this, "Ghosted coords construction", currentLevel);
626  ghostedCoords = Xpetra::MultiVectorFactory<double,LO,GO,NO>::Build(nonUniqueMap, coords->getNumVectors());
627  ghostedCoords->doImport(*coords, *importer, Xpetra::INSERT);
628  }
629 
630  auto ghostedCoordsView = ghostedCoords->template getLocalView<DeviceType>();
631  DistanceFunctor<LO, decltype(ghostedCoordsView)> distFunctor(ghostedCoordsView);
632 
633  // Construct Laplacian diagonal
634  RCP<Vector> localLaplDiag;
635  {
636  SubFactoryMonitor m2(*this, "Local Laplacian diag construction", currentLevel);
637 
638  localLaplDiag = VectorFactory::Build(uniqueMap);
639 
640  auto localLaplDiagView = localLaplDiag->template getLocalView<DeviceType>();
641  auto kokkosGraph = kokkosMatrix.graph;
642 
643  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:laplacian_diag", range_type(0,numRows),
644  KOKKOS_LAMBDA(const LO row) {
645  auto rowView = kokkosGraph.rowConst(row);
646  auto length = rowView.length;
647 
648  SC d = ATS::zero();
649  for (decltype(length) colID = 0; colID < length; colID++) {
650  auto col = rowView(colID);
651  if (row != col)
652  d += ATS::one()/distFunctor.distance2(row, col);
653  }
654  localLaplDiagView(row,0) = d;
655  });
656  }
657 
658  // Construct ghosted Laplacian diagonal
659  RCP<Vector> ghostedLaplDiag;
660  {
661  SubFactoryMonitor m2(*this, "Ghosted Laplacian diag construction", currentLevel);
662  ghostedLaplDiag = VectorFactory::Build(nonUniqueMap);
663  ghostedLaplDiag->doImport(*localLaplDiag, *importer, Xpetra::INSERT);
664  }
665 
666  // Filter out entries
667  {
668  SubFactoryMonitor m2(*this, "MainLoop", currentLevel);
669 
670  auto ghostedLaplDiagView = ghostedLaplDiag->template getLocalView<DeviceType>();
671 
672  DistanceLaplacianDropFunctor<LO, decltype(ghostedLaplDiagView), decltype(distFunctor)>
673  dropFunctor(ghostedLaplDiagView, distFunctor, threshold);
674  ScalarFunctor<SC, LO, local_matrix_type, decltype(bndNodes), decltype(dropFunctor)>
675  scalarFunctor(kokkosMatrix, bndNodes, dropFunctor, rows, colsAux, valsAux, reuseGraph, lumping, threshold);
676 
677  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:main_loop", range_type(0,numRows),
678  scalarFunctor, nnzFA);
679  }
680  }
681 
682  }
683  numDropped = nnzA - nnzFA;
684 
685  boundaryNodes = bndNodes;
686 
687  {
688  SubFactoryMonitor m2(*this, "CompressRows", currentLevel);
689 
690  // parallel_scan (exclusive)
691  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:compress_rows", range_type(0,numRows+1),
692  KOKKOS_LAMBDA(const LO i, LO& update, const bool& final_pass) {
693  update += rows(i);
694  if (final_pass)
695  rows(i) = update;
696  });
697  }
698 
699  // Compress cols (and optionally vals)
700  // We use a trick here: we moved all remaining elements to the beginning
701  // of the original row in the main loop, so we don't need to check for
702  // INVALID here, and just stop when achieving the new number of elements
703  // per row.
704  cols_type cols(Kokkos::ViewAllocateWithoutInitializing("FA_cols"), nnzFA);
705  vals_type vals;
706  if (reuseGraph) {
707  // Only compress cols
708  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
709 
710  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
711  KOKKOS_LAMBDA(const LO i) {
712  // Is there Kokkos memcpy?
713  LO rowStart = rows(i);
714  LO rowAStart = rowsA(i);
715  size_t rownnz = rows(i+1) - rows(i);
716  for (size_t j = 0; j < rownnz; j++)
717  cols(rowStart+j) = colsAux(rowAStart+j);
718  });
719  } else {
720  // Compress cols and vals
721  SubFactoryMonitor m2(*this, "CompressColsAndVals", currentLevel);
722 
723  vals = vals_type(Kokkos::ViewAllocateWithoutInitializing("FA_vals"), nnzFA);
724 
725  Kokkos::parallel_for("MueLu:TentativePF:Build:compress_cols", range_type(0,numRows),
726  KOKKOS_LAMBDA(const LO i) {
727  LO rowStart = rows(i);
728  LO rowAStart = rowsA(i);
729  size_t rownnz = rows(i+1) - rows(i);
730  for (size_t j = 0; j < rownnz; j++) {
731  cols(rowStart+j) = colsAux(rowAStart+j);
732  vals(rowStart+j) = colsAux(rowAStart+j);
733  }
734  });
735  }
736 
737  kokkos_graph_type kokkosGraph(cols, rows);
738 
739  {
740  SubFactoryMonitor m2(*this, "LWGraph construction", currentLevel);
741 
742  graph = rcp(new LWGraph_kokkos(kokkosGraph, A->getRowMap(), A->getColMap(), "filtered graph of A"));
743  graph->SetBoundaryNodeMap(boundaryNodes);
744  }
745 
746  numTotal = A->getNodeNumEntries();
747 
748  dofsPerNode = 1;
749 
750  if (!reuseGraph) {
751  SubFactoryMonitor m2(*this, "LocalMatrix+FillComplete", currentLevel);
752 
753  local_matrix_type localFA = local_matrix_type("A", numRows, kokkosMatrix.numCols(), nnzFA, vals, rows, cols);
754 #if 1
755  // FIXME: this should be gone once Xpetra propagate "local matrix + 4 maps" constructor
756  // FIXME: we don't need to rebuild importers/exporters. We should reuse A's
757  auto filteredACrs = CrsMatrixFactory::Build(A->getRowMap(), A->getColMap(), localFA);
758  filteredACrs->resumeFill(); // we need that for rectangular matrices
759  filteredACrs->expertStaticFillComplete(A->getDomainMap(), A->getRangeMap());
760 #endif
761  filteredA = rcp(new CrsMatrixWrap(filteredACrs));
762  }
763 
764  filteredA->SetFixedBlockSize(A->GetFixedBlockSize());
765 
766  if (pL.get<bool>("filtered matrix: reuse eigenvalue")) {
767  // Reuse max eigenvalue from A
768  // It is unclear what eigenvalue is the best for the smoothing, but we already may have
769  // the D^{-1}A estimate in A, may as well use it.
770  // NOTE: ML does that too
771  filteredA->SetMaxEigenvalueEstimate(A->GetMaxEigenvalueEstimate());
772  } else {
773  filteredA->SetMaxEigenvalueEstimate(-Teuchos::ScalarTraits<SC>::one());
774  }
775 
776  } else if (blkSize > 1 && threshold == zero) {
777  // Case 3: block problem without filtering
778  //
779  // FIXME_KOKKOS: this code is completely unoptimized. It really should do
780  // a very simple thing: merge rows and produce nodal graph. But the code
781  // seems very complicated. Can we do better?
782 
783  TEUCHOS_TEST_FOR_EXCEPTION(A->getRowMap()->getNodeNumElements() % blkSize != 0, MueLu::Exceptions::RuntimeError, "MueLu::CoalesceDropFactory: Number of local elements is " << A->getRowMap()->getNodeNumElements() << " but should be a multiply of " << blkSize);
784 
785  const RCP<const Map> rowMap = A->getRowMap();
786  const RCP<const Map> colMap = A->getColMap();
787 
788  // build a node row map (uniqueMap = non-overlapping) and a node column map
789  // (nonUniqueMap = overlapping). The arrays rowTranslation and colTranslation
790  // stored in the AmalgamationInfo class container contain the local node id
791  // given a local dof id. The data is calculated in the AmalgamationFactory and
792  // stored in the variable "UnAmalgamationInfo" (which is of type AmalagamationInfo)
793  const RCP<const Map> uniqueMap = amalInfo->getNodeRowMap();
794  const RCP<const Map> nonUniqueMap = amalInfo->getNodeColMap();
795  Array<LO> rowTranslationArray = *(amalInfo->getRowTranslation()); // TAW should be transform that into a View?
796  Array<LO> colTranslationArray = *(amalInfo->getColTranslation());
797 
798  // get number of local nodes
799  LO numNodes = Teuchos::as<LocalOrdinal>(uniqueMap->getNodeNumElements());
800  typedef typename Kokkos::View<LocalOrdinal*, DeviceType> id_translation_type;
801  id_translation_type rowTranslation("dofId2nodeId",rowTranslationArray.size());
802  id_translation_type colTranslation("ov_dofId2nodeId",colTranslationArray.size());
803 
804  // TODO change this to lambdas
805  for (decltype(rowTranslationArray.size()) i = 0; i < rowTranslationArray.size(); ++i)
806  rowTranslation(i) = rowTranslationArray[i];
807  for (decltype(colTranslationArray.size()) i = 0; i < colTranslationArray.size(); ++i)
808  colTranslation(i) = colTranslationArray[i];
809  // extract striding information
810  blkSize = A->GetFixedBlockSize(); //< the full block size (number of dofs per node in strided map)
811  LocalOrdinal blkId = -1; //< the block id within a strided map or -1 if it is a full block map
812  LocalOrdinal blkPartSize = A->GetFixedBlockSize(); //< stores block size of part blkId (or the full block size)
813  if(A->IsView("stridedMaps") == true) {
814  const RCP<const Map> myMap = A->getRowMap("stridedMaps");
815  const RCP<const StridedMap> strMap = Teuchos::rcp_dynamic_cast<const StridedMap>(myMap);
816  TEUCHOS_TEST_FOR_EXCEPTION(strMap.is_null() == true, Exceptions::RuntimeError, "Map is not of type stridedMap");
817  blkSize = Teuchos::as<const LocalOrdinal>(strMap->getFixedBlockSize());
818  blkId = strMap->getStridedBlockId();
819  if (blkId > -1)
820  blkPartSize = Teuchos::as<LocalOrdinal>(strMap->getStridingData()[blkId]);
821  }
822  auto kokkosMatrix = A->getLocalMatrix(); // access underlying kokkos data
823 
824  //
825  typedef typename Matrix::local_matrix_type local_matrix_type;
826  typedef typename LWGraph_kokkos::local_graph_type kokkos_graph_type;
827  typedef typename kokkos_graph_type::row_map_type row_map_type;
828  //typedef typename row_map_type::HostMirror row_map_type_h;
829  typedef typename kokkos_graph_type::entries_type entries_type;
830 
831  // Stage 1c: get number of dof-nonzeros per blkSize node rows
832  typename row_map_type::non_const_type dofNnz("nnz_map", numNodes + 1);
833  LO numDofCols = 0;
834  Stage1aVectorFunctor<decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize)> stage1aFunctor(kokkosMatrix, dofNnz, blkPartSize);
835  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1a", range_type(0,numNodes), stage1aFunctor, numDofCols);
836  // parallel_scan (exclusive)
837  ScanFunctor<LO,decltype(dofNnz)> scanFunctor(dofNnz);
838  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanFunctor);
839 
840  typename entries_type::non_const_type dofcols("dofcols", numDofCols/*dofNnz(numNodes)*/); // why does dofNnz(numNodes) work? should be a parallel reduce, i guess
841  Stage1bVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(blkPartSize), decltype(dofcols)> stage1bFunctor(kokkosMatrix, dofNnz, blkPartSize, dofcols);
842  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1b", range_type(0,numNodes), stage1bFunctor);
843 
844  // we have dofcols and dofids from Stage1dVectorFunctor
845  LO numNodeCols = 0;
846  typename row_map_type::non_const_type rows("nnz_nodemap", numNodes + 1);
847  typename boundary_nodes_type::non_const_type bndNodes("boundaryNodes", numNodes);
848  Stage1cVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(colTranslation), decltype(bndNodes)> stage1cFunctor(dofNnz, dofcols, colTranslation,rows,bndNodes);
849  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1cFunctor,numNodeCols);
850 
851  // parallel_scan (exclusive)
852  ScanFunctor<LO,decltype(rows)> scanNodeFunctor(rows);
853  Kokkos::parallel_scan("MueLu:CoalesceDropF:Build:scalar_filter:stage1_scan", range_type(0,numNodes+1), scanNodeFunctor);
854 
855  // create column node view
856  typename entries_type::non_const_type cols("nodecols", numNodeCols);
857 
858 
859  Stage1dVectorFunctor <decltype(kokkosMatrix), decltype(dofNnz), decltype(dofcols), decltype(rows), decltype(cols)> stage1dFunctor(dofcols, dofNnz, cols, rows);
860  Kokkos::parallel_for("MueLu:CoalesceDropF:Build:scalar_filter:stage1c", range_type(0,numNodes), stage1dFunctor);
861  kokkos_graph_type kokkosGraph(cols, rows);
862 
863  // create LW graph
864  graph = rcp(new LWGraph_kokkos(kokkosGraph, uniqueMap, nonUniqueMap, "amalgamated graph of A"));
865 
866  boundaryNodes = bndNodes;
867  graph->SetBoundaryNodeMap(boundaryNodes);
868  numTotal = A->getNodeNumEntries();
869 
870  dofsPerNode = blkSize;
871 
872  filteredA = A;
873 
874  } else {
875  TEUCHOS_TEST_FOR_EXCEPTION(true, Exceptions::RuntimeError, "MueLu: CoalesceDropFactory_kokkos: Block filtering is not implemented");
876  }
877 
878  if (GetVerbLevel() & Statistics1) {
879  GO numLocalBoundaryNodes = 0;
880  GO numGlobalBoundaryNodes = 0;
881 
882  Kokkos::parallel_reduce("MueLu:CoalesceDropF:Build:bnd", range_type(0, boundaryNodes.extent(0)),
883  KOKKOS_LAMBDA(const LO i, GO& n) {
884  if (boundaryNodes(i))
885  n++;
886  }, numLocalBoundaryNodes);
887 
888  auto comm = A->getRowMap()->getComm();
889  MueLu_sumAll(comm, numLocalBoundaryNodes, numGlobalBoundaryNodes);
890  GetOStream(Statistics1) << "Detected " << numGlobalBoundaryNodes << " Dirichlet nodes" << std::endl;
891  }
892 
893  if ((GetVerbLevel() & Statistics1) && threshold != zero) {
894  auto comm = A->getRowMap()->getComm();
895 
896  GO numGlobalTotal, numGlobalDropped;
897  MueLu_sumAll(comm, numTotal, numGlobalTotal);
898  MueLu_sumAll(comm, numDropped, numGlobalDropped);
899 
900  if (numGlobalTotal != 0) {
901  GetOStream(Statistics1) << "Number of dropped entries: "
902  << numGlobalDropped << "/" << numGlobalTotal
903  << " (" << 100*Teuchos::as<double>(numGlobalDropped)/Teuchos::as<double>(numGlobalTotal) << "%)" << std::endl;
904  }
905  }
906 
907  Set(currentLevel, "DofsPerNode", dofsPerNode);
908  Set(currentLevel, "Graph", graph);
909  Set(currentLevel, "A", filteredA);
910  }
911 }
912 #endif // HAVE_MUELU_KOKKOS_REFACTOR
913 #endif // MUELU_COALESCEDROPFACTORY_KOKKOS_DEF_HPP
Important warning messages (one line)
#define MueLu_sumAll(rcpComm, in, out)
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
One-liner description of what is happening.
Namespace for MueLu classes and methods.
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Exception throws to report errors in the internal logical of the program.
Kokkos::View< const bool *, typename NO::device_type > DetectDirichletRows(const Xpetra::Matrix< SC, LO, GO, NO > &A, const typename Teuchos::ScalarTraits< SC >::magnitudeType &tol, const bool count_twos_as_dirichlet)
#define SET_VALID_ENTRY(name)