// Zoltan2_TaskMapping.hpp
#ifndef _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_
#define _ZOLTAN2_COORD_PARTITIONMAPPING_HPP_

#include <fstream>
#include <ctime>
#include <vector>
#include <set>
#include <tuple>

#include "Zoltan2_Standards.hpp"
#include "Zoltan2_AlgMultiJagged.hpp"
#include "Teuchos_ArrayViewDecl.hpp"
#include "Zoltan2_PartitionMapping.hpp"
#include "Zoltan2_MachineRepresentation.hpp"
#include "Teuchos_ReductionOp.hpp"

#include "Zoltan2_MappingSolution.hpp"

#include "Zoltan2_GraphModel.hpp"

#include <Zoltan2_TPLTraits.hpp>

#include "Teuchos_Comm.hpp"
#ifdef HAVE_ZOLTAN2_MPI
#include "Teuchos_DefaultMpiComm.hpp"
#endif // HAVE_ZOLTAN2_MPI
#include <Teuchos_DefaultSerialComm.hpp>

//#define gnuPlot
#include "Zoltan2_Directory_Impl.hpp"

namespace Teuchos{

/*! \brief Zoltan2_ReduceBestMapping Class, reduces the minimum cost
 *  mapping, breaking ties with the minimum rank.
 */
template <typename Ordinal, typename T>
class Zoltan2_ReduceBestMapping : public ValueTypeReductionOp<Ordinal,T>
{

private:
  T epsilon;

public:
  /*! \brief Default Constructor
   */
  Zoltan2_ReduceBestMapping() : epsilon(std::numeric_limits<T>::epsilon()) {}

  /*! \brief Implement Teuchos::ValueTypeReductionOp interface.
   *  The buffers hold (cost, rank) pairs; the incoming pair wins when its
   *  cost is smaller, or when the costs are (nearly) equal and its rank
   *  is smaller.
   */
  void reduce(const Ordinal count,
              const T inBuffer[],
              T inoutBuffer[]) const {

    for (Ordinal i = 0; i < count; i++) {
      if (inBuffer[0] - inoutBuffer[0] < -epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      } else if(inBuffer[0] - inoutBuffer[0] < epsilon &&
                inBuffer[1] - inoutBuffer[1] < epsilon) {
        inoutBuffer[0] = inBuffer[0];
        inoutBuffer[1] = inBuffer[1];
      }
    }
  }
};

} // namespace Teuchos

namespace Zoltan2{

// Computes x! recursively; callers pass values >= 1.
template <typename it>
inline it z2Fact(it x) {
  return (x == 1 ? x : x * z2Fact<it>(x - 1));
}

template <typename gno_t, typename part_t>
class GNO_LNO_PAIR {
public:
  gno_t gno;
  part_t part;
};

//returns the ith permutation indices.
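// For example, for n = 3 the lexicographic permutations of {0,1,2} are
// indexed 0:{0,1,2}, 1:{0,2,1}, 2:{1,0,2}, 3:{1,2,0}, ..., so
// ithPermutation(3, 3, perm) yields perm = {1, 2, 0}. The loops below first
// decode i into factorial-base digits (the Lehmer code), then adjust the
// digits into actual indices.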
template <typename IT>
void ithPermutation(const IT n, IT i, IT *perm)
{
  IT j, k = 0;
  IT *fact = new IT[n];

  // compute factorial numbers
  fact[k] = 1;
  while (++k < n)
    fact[k] = fact[k - 1] * k;

  // compute factorial code
  for (k = 0; k < n; ++k)
  {
    perm[k] = i / fact[n - 1 - k];
    i = i % fact[n - 1 - k];
  }

  // readjust values to obtain the permutation
  // start from the end and check if preceding values are lower
  for (k = n - 1; k > 0; --k)
    for (j = k - 1; j >= 0; --j)
      if (perm[j] <= perm[k])
        perm[k]++;

  delete [] fact;
}

template <typename part_t>
void getGridCommunicationGraph(
    part_t taskCount,
    part_t *&task_comm_xadj,
    part_t *&task_comm_adj,
    std::vector<int> grid_dims) {
  int dim = grid_dims.size();
  int neighborCount = 2 * dim;
  task_comm_xadj = new part_t[taskCount + 1];
  task_comm_adj = new part_t[taskCount * neighborCount];

  part_t neighBorIndex = 0;
  task_comm_xadj[0] = 0;
  for (part_t i = 0; i < taskCount; ++i) {
    part_t prevDimMul = 1;
    for (int j = 0; j < dim; ++j) {
      part_t lNeighbor = i - prevDimMul;
      part_t rNeighbor = i + prevDimMul;
      prevDimMul *= grid_dims[j];
      if (lNeighbor >= 0 &&
          lNeighbor / prevDimMul == i / prevDimMul &&
          lNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = lNeighbor;
      }
      if (rNeighbor >= 0 &&
          rNeighbor / prevDimMul == i / prevDimMul &&
          rNeighbor < taskCount) {
        task_comm_adj[neighBorIndex++] = rNeighbor;
      }
    }
    task_comm_xadj[i + 1] = neighBorIndex;
  }
}
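// Example: for grid_dims = {2, 2} (taskCount = 4, tasks numbered with
// dimension 0 fastest), the CSR output of getGridCommunicationGraph is
//   task_comm_xadj = {0, 2, 4, 6, 8}
//   task_comm_adj  = {1, 2,  0, 3,  3, 0,  2, 1}
// i.e. task 0 talks to {1, 2}, task 3 talks to {2, 1}, and so on.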
//returns the center of the parts.
template <typename Adapter, typename scalar_t, typename part_t>
void getSolutionCenterCoordinates(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::CoordinateModel<typename Adapter::base_adapter_t> *coords,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    const part_t *parts,
    int coordDim,
    part_t ntasks,
    scalar_t **partCenters) {

  typedef typename Adapter::lno_t lno_t;
  typedef typename Adapter::gno_t gno_t;

  typedef StridedData<lno_t, scalar_t> input_t;
  ArrayView<const gno_t> gnos;
  ArrayView<input_t> xyz;
  ArrayView<input_t> wgts;
  coords->getCoordinates(gnos, xyz, wgts);

  //local and global num coordinates.
  lno_t numLocalCoords = coords->getLocalNumCoordinates();
  //gno_t numGlobalCoords = coords->getGlobalNumCoordinates();

  //local number of points in each part.
  gno_t *point_counts = new gno_t[ntasks];
  memset(point_counts, 0, sizeof(gno_t) * ntasks);

  //global number of points in each part.
  gno_t *global_point_counts = new gno_t[ntasks];

  scalar_t **multiJagged_coordinates = new scalar_t*[coordDim];

  ArrayRCP<ArrayRCP<const scalar_t>> ar =
    arcp(new ArrayRCP<const scalar_t>[coordDim], 0, coordDim);
  for (int dim = 0; dim < coordDim; dim++) {
    xyz[dim].getInputArray(ar[dim]);
    //multiJagged coordinate values assignment
    multiJagged_coordinates[dim] = (scalar_t *)ar[dim].getRawPtr();
    memset(partCenters[dim], 0, sizeof(scalar_t) * ntasks);
  }

  //get parts with parallel gnos.
  //const part_t *parts = soln_->getPartListView();

  envConst->timerStart(MACRO_TIMERS, "Mapping - Center Calculation");

  for (lno_t i = 0; i < numLocalCoords; i++) {
    part_t p = parts[i];
    //add up all coordinates in each part.
    for (int j = 0; j < coordDim; ++j) {
      scalar_t c = multiJagged_coordinates[j][i];
      partCenters[j][p] += c;
    }
    ++point_counts[p];
  }

  //get global number of points in each part.
  reduceAll<int, gno_t>(*comm, Teuchos::REDUCE_SUM,
                        ntasks, point_counts, global_point_counts);

  for (int j = 0; j < coordDim; ++j) {
    for (part_t i = 0; i < ntasks; ++i) {
      if (global_point_counts[i] > 0)
        partCenters[j][i] /= global_point_counts[i];
    }
  }

  scalar_t *tmpCoords = new scalar_t[ntasks];
  for (int j = 0; j < coordDim; ++j) {
    reduceAll<int, scalar_t>(*comm, Teuchos::REDUCE_SUM,
                             ntasks, partCenters[j], tmpCoords);

    //swap the buffers so partCenters[j] ends up holding the globally
    //reduced centers; tmpCoords is reused for the next dimension.
    scalar_t *tmp = partCenters[j];
    partCenters[j] = tmpCoords;
    tmpCoords = tmp;
  }

  envConst->timerStop(MACRO_TIMERS, "Mapping - Center Calculation");

  delete [] point_counts;
  delete [] global_point_counts;

  delete [] tmpCoords;
  delete [] multiJagged_coordinates;
}

//returns the coarsened part graph.
template <typename Adapter, typename scalar_t, typename part_t>
void getCoarsenedPartGraph(
    const Environment *envConst,
    const Teuchos::Comm<int> *comm,
    const Zoltan2::GraphModel<typename Adapter::base_adapter_t> *graph,
    //const Zoltan2::PartitioningSolution<Adapter> *soln_,
    part_t np,
    const part_t *parts,
    ArrayRCP<part_t> &g_part_xadj,
    ArrayRCP<part_t> &g_part_adj,
    ArrayRCP<scalar_t> &g_part_ew) {
249
250 typedef typename Adapter::lno_t t_lno_t;
251 typedef typename Adapter::gno_t t_gno_t;
252 typedef typename Adapter::scalar_t t_scalar_t;
253 typedef typename Adapter::offset_t t_offset_t;
254 typedef typename Zoltan2::GraphModel<
255 typename Adapter::base_adapter_t>::input_t t_input_t;
256
257 //int numRanks = comm->getSize();
258 //int myRank = comm->getRank();
259
260 //get parts with parallel gnos.
261 /*
262 const part_t *parts = soln_->getPartListView();
263
264 part_t np = soln_->getActualGlobalNumberOfParts();
265 if (part_t(soln_->getTargetGlobalNumberOfParts()) > np) {
266 np = soln_->getTargetGlobalNumberOfParts();
267 }
268 */
269
270
271 t_lno_t localNumVertices = graph->getLocalNumVertices();
272 t_lno_t localNumEdges = graph->getLocalNumEdges();
273
274 //get the vertex global ids, and weights
275 ArrayView<const t_gno_t> Ids;
276 ArrayView<t_input_t> v_wghts;
277 graph->getVertexList(Ids, v_wghts);
278
279 //get the edge ids, and weights
280 ArrayView<const t_gno_t> edgeIds;
281 ArrayView<const t_offset_t> offsets;
282 ArrayView<t_input_t> e_wgts;
283 graph->getEdgeList(edgeIds, offsets, e_wgts);
284
285 std::vector<t_scalar_t> edge_weights;
286 int numWeightPerEdge = graph->getNumWeightsPerEdge();
287
288 if (numWeightPerEdge > 0) {
289 edge_weights = std::vector<t_scalar_t>(localNumEdges);
290 for (t_lno_t i = 0; i < localNumEdges; ++i) {
291 edge_weights[i] = e_wgts[0][i];
292 }
293 }
294
295 //create a zoltan dictionary to get the parts of the vertices
296 //at the other end of edges
297 std::vector<part_t> e_parts(localNumEdges);
298#ifdef HAVE_ZOLTAN2_MPI
299 if (comm->getSize() > 1) {
300 const bool bUseLocalIDs = false; // Local IDs not needed
302 int debug_level = 0;
303 const RCP<const Comm<int> > rcp_comm(comm,false);
304 directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
305 directory.update(localNumVertices, &Ids[0], NULL, &parts[0],
306 NULL, directory_t::Update_Mode::Replace);
307 directory.find(localNumEdges, &edgeIds[0], NULL, &e_parts[0],
308 NULL, NULL, false);
309 } else
310#endif
  {

    /*
    std::cout << "localNumVertices:" << localNumVertices
              << " np:" << np
              << " globalNumVertices:" << graph->getGlobalNumVertices()
              << " localNumEdges:" << localNumEdges << std::endl;
    */

    for (t_lno_t i = 0; i < localNumEdges; ++i) {
      t_gno_t ei = edgeIds[i];
      part_t p = parts[ei];
      e_parts[i] = p;
    }

    //get the vertices in each part in my part.
    std::vector<t_lno_t> part_begins(np, -1);
    std::vector<t_lno_t> part_nexts(localNumVertices, -1);

    //cluster vertices according to their parts.
    //create local part graph.
    for (t_lno_t i = 0; i < localNumVertices; ++i) {
      part_t ap = parts[i];
      part_nexts[i] = part_begins[ap];
      part_begins[ap] = i;
    }

    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(localNumEdges);
    g_part_ew = ArrayRCP<t_scalar_t>(localNumEdges);
    part_t nindex = 0;
    g_part_xadj[0] = 0;
    std::vector<part_t> part_neighbors(np);
    std::vector<t_scalar_t> part_neighbor_weights(np, 0);
    std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

    //coarsen for all vertices in my part in order with parts.
    for (t_lno_t i = 0; i < np; ++i) {
      part_t num_neighbor_parts = 0;
      t_lno_t v = part_begins[i];
      //get part i, and first vertex in this part v.
      while (v != -1) {
        //now get the neighbors of v.
        for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
          //get the part of the second vertex.
          part_t ep = e_parts[j];

          t_scalar_t ew = 1;
          if (numWeightPerEdge > 0) {
            ew = edge_weights[j];
          }

//        std::cout << "part:" << i << " v:" << v
//                  << " part2:" << ep << " v2:" << edgeIds[j]
//                  << " w:" << ew << std::endl;

          //a zero accumulated weight means part ep is seen for the
          //first time for part i.
          if (part_neighbor_weights[ep] < 0.00001) {
            part_neighbors[num_neighbor_parts++] = ep;
          }

          part_neighbor_weights[ep] += ew;
        }

        v = part_nexts[v];
      }

      //now get the part list.
      for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
        part_t neighbor_part = part_neighbors[j];
        g_part_adj[nindex] = neighbor_part;
        g_part_ew[nindex++] = part_neighbor_weights[neighbor_part];
        part_neighbor_weights[neighbor_part] = 0;
      }
      g_part_xadj[i + 1] = nindex;
    }
  }

#ifdef HAVE_ZOLTAN2_MPI
  if (comm->getSize() > 1) { // Otherwise it's already handled above and done

    // struct for directory data - note more extensive comments in
    // Zoltan2_GraphMetricsUtility.hpp which I didn't want to duplicate
    // because this is in progress. Similar concept there.
    struct part_info {
      part_info() : weight(0) {
      }

      const part_info & operator+=(const part_info & src) {
        this->weight += src.weight;
        return *this;
      }

      bool operator>(const part_info & src) {
        return (destination_part > src.destination_part);
      }

      bool operator==(const part_info & src) {
        return (destination_part == src.destination_part);
      }

      part_t destination_part;
      scalar_t weight;
    };
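    // The operators above are what let the directory merge entries: with
    // Update_Mode::AggregateAdd (used below), entries that compare equal
    // (same destination_part) are presumably combined via operator+=, so
    // edge weights contributed by different ranks into the same part get
    // summed.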

    bool bUseLocalIDs = false;
    const int debug_level = 0;
    typedef Zoltan2_Directory_Vector<part_t, t_lno_t, std::vector<part_info>>
      directory_t;
    const RCP<const Comm<int> > rcp_comm(comm, false);
    directory_t directory(rcp_comm, bUseLocalIDs, debug_level);
    std::vector<part_t> part_data;
    std::vector<std::vector<part_info>> user_data;

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE Coarsen");
    {
      //get the vertices in each part in my part.
      std::vector<t_lno_t> part_begins(np, -1);
      std::vector<t_lno_t> part_nexts(localNumVertices, -1);

      //cluster vertices according to their parts.
      //create local part graph.
      for (t_lno_t i = 0; i < localNumVertices; ++i) {
        part_t ap = parts[i];
        part_nexts[i] = part_begins[ap];
        part_begins[ap] = i;
      }

      std::vector<part_t> part_neighbors(np);
      std::vector<t_scalar_t> part_neighbor_weights(np, 0);
      std::vector<t_scalar_t> part_neighbor_weights_ordered(np);

      //coarsen for all vertices in my part in order with parts.
      for (t_lno_t i = 0; i < np; ++i) {
        part_t num_neighbor_parts = 0;
        t_lno_t v = part_begins[i];

        //get part i, and first vertex in this part v.
        while (v != -1) {
          //now get the neighbors of v.
          for (t_offset_t j = offsets[v]; j < offsets[v+1]; ++j) {
            //get the part of the second vertex.
            part_t ep = e_parts[j];

            t_scalar_t ew = 1;
            if (numWeightPerEdge > 0) {
              ew = edge_weights[j];
            }

            //add it to my local part neighbors for part i.
            if (part_neighbor_weights[ep] < 0.00001) {
              part_neighbors[num_neighbor_parts++] = ep;
            }

            part_neighbor_weights[ep] += ew;
          }

          v = part_nexts[v];
        }

        //now get the part list.
        for (t_lno_t j = 0; j < num_neighbor_parts; ++j) {
          part_t neighbor_part = part_neighbors[j];
          part_neighbor_weights_ordered[j] =
            part_neighbor_weights[neighbor_part];
          part_neighbor_weights[neighbor_part] = 0;
        }

        //insert it to tpetra crsmatrix.
        if (num_neighbor_parts > 0) {
          part_data.push_back(i); // TODO: optimize to avoid push_back
          std::vector<part_info> new_user_data(num_neighbor_parts);
          for (int q = 0; q < num_neighbor_parts; ++q) {
            part_info & info = new_user_data[q];
            info.weight = part_neighbor_weights_ordered[q];
            info.destination_part = part_neighbors[q];
          }
          // TODO: optimize to avoid push_back
          user_data.push_back(new_user_data);
        }
      }
    }
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE Coarsen");

    std::vector<part_t> part_indices(np);

    for (part_t i = 0; i < np; ++i) part_indices[i] = i;

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory update");
    directory.update(part_data.size(), &part_data[0], NULL, &user_data[0],
                     NULL, directory_t::Update_Mode::AggregateAdd);
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory update");

    envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE directory find");
    std::vector<std::vector<part_info>> find_user_data(part_indices.size());
    directory.find(find_user_data.size(), &part_indices[0], NULL,
                   &find_user_data[0], NULL, NULL, false);
    envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE directory find");

    // Now reconstruct the output data from the directory find data.
    // This code was designed to reproduce the exact format of the original
    // setup for g_part_xadj, g_part_adj, and g_part_ew but before making any
    // further changes I wanted to verify if this formatting should be
    // preserved or potentially changed further.

    // first thing is get the total number of elements
    int get_total_length = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      get_total_length += find_user_data[n].size();
    }

    // setup data space
    g_part_xadj = ArrayRCP<part_t>(np + 1);
    g_part_adj = ArrayRCP<part_t>(get_total_length);
    g_part_ew = ArrayRCP<t_scalar_t>(get_total_length);

    // loop through again and fill to match the original formatting
    int track_insert_index = 0;
    for (size_t n = 0; n < find_user_data.size(); ++n) {
      g_part_xadj[n] = track_insert_index;
      const std::vector<part_info> & user_data_vector = find_user_data[n];
      for (size_t q = 0; q < user_data_vector.size(); ++q) {
        const part_info & info = user_data_vector[q];
        g_part_adj[track_insert_index] = info.destination_part;
        g_part_ew[track_insert_index] = info.weight;
        ++track_insert_index;
      }
    }
    g_part_xadj[np] = get_total_length; // complete the series
  }
#endif // HAVE_ZOLTAN2_MPI
}
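// In both branches the result is a CSR-style part graph: part i's neighbor
// parts are g_part_adj[g_part_xadj[i] : g_part_xadj[i+1]], with the summed
// edge weights in g_part_ew. The serial branch builds it locally; the MPI
// branch aggregates per-rank contributions through the distributed
// directory.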

/*! \brief KmeansHeap Class, max heap, but holds the minimum values.
 */
template <class IT, class WT>
class KmeansHeap{

  IT heapSize;
  IT *indices;
  WT *values;
  WT epsilon;

public:
  ~KmeansHeap(){
    delete [] this->indices;
    delete [] this->values;
  }

  void setHeapsize(IT heapsize_){
    this->heapSize = heapsize_;
    this->indices = new IT[heapsize_];
    this->values = new WT[heapsize_];
    this->epsilon = std::numeric_limits<WT>::epsilon();
  }

  void addPoint(IT index, WT distance) {
    WT maxVal = this->values[0];
    //add only if the distance is smaller than the current maximum distance.
//  std::cout << "indeX:" << index
//            << " distance:" << distance
//            << " maxVal:" << maxVal << endl;
    if (distance >= maxVal)
      return;
    else {
      this->values[0] = distance;
      this->indices[0] = index;
      this->push_down(0);
    }
  }

  //heap push down operation
  void push_down(IT index_on_heap) {
    IT child_index1 = 2 * index_on_heap + 1;
    IT child_index2 = 2 * index_on_heap + 2;

    IT biggerIndex = -1;
    if (child_index1 < this->heapSize && child_index2 < this->heapSize) {

      if (this->values[child_index1] < this->values[child_index2]) {
        biggerIndex = child_index2;
      }
      else {
        biggerIndex = child_index1;
      }
    }
    else if (child_index1 < this->heapSize) {
      biggerIndex = child_index1;
    }
    else if (child_index2 < this->heapSize) {
      biggerIndex = child_index2;
    }
    if (biggerIndex >= 0 &&
        this->values[biggerIndex] > this->values[index_on_heap]) {
      WT tmpVal = this->values[biggerIndex];
      this->values[biggerIndex] = this->values[index_on_heap];
      this->values[index_on_heap] = tmpVal;

      IT tmpIndex = this->indices[biggerIndex];
      this->indices[biggerIndex] = this->indices[index_on_heap];
      this->indices[index_on_heap] = tmpIndex;
      this->push_down(biggerIndex);
    }
  }

  void initValues() {
    WT MAXVAL = std::numeric_limits<WT>::max();

    for (IT i = 0; i < this->heapSize; ++i) {
      this->values[i] = MAXVAL;
      this->indices[i] = -1;
    }
  }

  //returns the total distance to center in the cluster.
  WT getTotalDistance(){

    WT nc = 0;
    for (IT j = 0; j < this->heapSize; ++j) {
      nc += this->values[j];

//    std::cout << "index:" << this->indices[j]
//              << " distance:" << this->values[j] << endl;
    }
    return nc;
  }

  //returns the new center of the cluster.
  bool getNewCenters(WT *center, WT **coords, int dimension) {
    bool moved = false;

    for (int i = 0; i < dimension; ++i) {
      WT nc = 0;

      for (IT j = 0; j < this->heapSize; ++j) {
        IT k = this->indices[j];
//      std::cout << "i:" << i
//                << " dim:" << dimension
//                << " k:" << k
//                << " heapSize:" << heapSize << endl;
        nc += coords[i][k];
      }

      nc /= this->heapSize;
      moved = (std::abs(center[i] - nc) > this->epsilon || moved);
      center[i] = nc;
    }
    return moved;
  }

  void copyCoordinates(IT *permutation) {
    for (IT i = 0; i < this->heapSize; ++i) {
      permutation[i] = this->indices[i];
    }
  }
};
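// Design note: the heap has a fixed capacity (the number of points the
// cluster must keep). After initValues() fills it with +infinity,
// addPoint() replaces the root (the largest kept distance) and sifts down
// whenever a closer point arrives, so each candidate costs O(log heapSize)
// and the heap always holds the heapSize nearest points seen so far.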

/*! \brief KMeansCluster Class
 */
template <class IT, class WT>
class KMeansCluster{

  int dimension;
  KmeansHeap<IT,WT> closestPoints;

public:
  WT *center;

  ~KMeansCluster() {
    delete [] center;
  }

  void setParams(int dimension_, int heapsize) {
    this->dimension = dimension_;
    this->center = new WT[dimension_];
    this->closestPoints.setHeapsize(heapsize);
  }

  void clearHeap(){
    this->closestPoints.initValues();
  }

  bool getNewCenters(WT **coords) {
    return this->closestPoints.getNewCenters(center, coords, dimension);
  }

  //returns the distance of the coordinate to the center.
  //also adds it to the heap.
  WT getDistance(IT index, WT **elementCoords) {
    WT distance = 0;

    for (int i = 0; i < this->dimension; ++i) {
      WT d = (center[i] - elementCoords[i][index]);
      distance += d * d;
    }
    //note: this takes the dimension-th root of the summed squares, so it
    //matches the Euclidean norm only when dimension == 2.
    distance = pow(distance, WT(1.0 / this->dimension));
    closestPoints.addPoint(index, distance);

    return distance;
  }

  WT getDistanceToCenter() {
    return closestPoints.getTotalDistance();
  }

  void copyCoordinates(IT *permutation) {
    closestPoints.copyCoordinates(permutation);
  }
};

/*! \brief KMeansAlgorithm Class that performs clustering of the
 *  coordinates, and returns the closest set of coordinates.
 */
template <class IT, class WT>
class KMeansAlgorithm{

  int dim;
  IT numElements;
  WT **elementCoords;
  IT numClusters;
  IT required_elements;
  KMeansCluster<IT,WT> *clusters;
  WT *maxCoordinates;
  WT *minCoordinates;

public:
  ~KMeansAlgorithm(){
    delete [] clusters;
    delete [] maxCoordinates;
    delete [] minCoordinates;
  }

  /*! \brief KMeansAlgorithm Constructor
   */
  KMeansAlgorithm(
      int dim_ ,
      IT numElements_,
      WT **elementCoords_,
      IT required_elements_):
    dim(dim_),
    numElements(numElements_),
    elementCoords(elementCoords_),
    numClusters((1 << dim_) + 1),
    required_elements(required_elements_) {
    this->clusters = new KMeansCluster<IT,WT>[this->numClusters];

    //set dimension and the number of required elements for all clusters.
    for (int i = 0; i < numClusters; ++i) {
      this->clusters[i].setParams(this->dim, this->required_elements);
    }

    this->maxCoordinates = new WT[this->dim];
    this->minCoordinates = new WT[this->dim];

    //obtain the min and max coordinates for each dimension.
    for (int j = 0; j < dim; ++j) {
      this->minCoordinates[j] = this->elementCoords[j][0];
      this->maxCoordinates[j] = this->elementCoords[j][0];

      for (IT i = 1; i < numElements; ++i) {
        WT t = this->elementCoords[j][i];
        if (t > this->maxCoordinates[j]) {
          this->maxCoordinates[j] = t;
        }

        if (t < minCoordinates[j]) {
          this->minCoordinates[j] = t;
        }
      }
    }

    //assign initial cluster centers: the first 2^dim clusters start at
    //the corners of the bounding box.
    for (int j = 0; j < dim; ++j) {
      int mod = (1 << (j+1));
      for (int i = 0; i < numClusters - 1; ++i) {
        WT c = 0;

        if ((i % mod) < mod / 2) {
          c = this->maxCoordinates[j];
//        std::cout << "i:" << i << " j:" << j
//                  << " setting max:" << c << endl;
        }
        else {
          c = this->minCoordinates[j];
        }

        this->clusters[i].center[j] = c;
      }
    }

    //last cluster center is placed to middle.
    for (int j = 0; j < dim; ++j) {
      this->clusters[numClusters - 1].center[j] =
        (this->maxCoordinates[j] + this->minCoordinates[j]) / 2;
    }

/*
    for (int i = 0; i < numClusters; ++i) {
//    std::cout << endl << "cluster:" << i << endl << "\t";

      for (int j = 0; j < dim; ++j) {
        std::cout << this->clusters[i].center[j] << " ";
      }
    }
*/
  }

  // Performs kmeans clustering of coordinates.
  void kmeans() {
    for (int it = 0; it < 10; ++it) {
//    std::cout << "it:" << it << endl;

      for (IT j = 0; j < this->numClusters; ++j) {
        this->clusters[j].clearHeap();
      }

      for (IT i = 0; i < this->numElements; ++i) {
//      std::cout << "i:" << i << " numEl:" << this->numElements << endl;

        for (IT j = 0; j < this->numClusters; ++j) {
//        std::cout << "j:" << j
//                  << " numClusters:" << this->numClusters << endl;

          this->clusters[j].getDistance(i, this->elementCoords);
        }
      }

      bool moved = false;

      for (IT j = 0; j < this->numClusters; ++j) {
        moved =
          (this->clusters[j].getNewCenters(this->elementCoords) || moved);
      }
      if (!moved) {
        break;
      }
    }
  }

  // Finds the cluster in which the coordinates are the closest to each
  // other.
  void getMinDistanceCluster(IT *procPermutation) {

    WT minDistance = this->clusters[0].getDistanceToCenter();
    IT minCluster = 0;

//  std::cout << "j:" << 0 << " minDistance:" << minDistance
//            << " minTmpDistance:" << minDistance
//            << " minCluster:" << minCluster << endl;

    for (IT j = 1; j < this->numClusters; ++j) {
      WT minTmpDistance = this->clusters[j].getDistanceToCenter();

//    std::cout << "j:" << j << " minDistance:" << minDistance
//              << " minTmpDistance:" << minTmpDistance
//              << " minCluster:" << minCluster << endl;

      if (minTmpDistance < minDistance) {
        minDistance = minTmpDistance;
        minCluster = j;
      }
    }

//  std::cout << "minCluster:" << minCluster << endl;
    this->clusters[minCluster].copyCoordinates(procPermutation);
  }
};
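// Intended call sequence (as used by getClosestSubset further below):
// seed 2^dim corner clusters plus one midpoint cluster, run a few kmeans
// iterations in which every cluster keeps its required_elements nearest
// points, then copy out the indices of the tightest cluster:
//   KMeansAlgorithm<part_t, pcoord_t> kma(dim, nprocs, coords, ntasks);
//   kma.kmeans();
//   kma.getMinDistanceCluster(proc_permutation);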

#define MINOF(a,b) (((a)<(b))?(a):(b))

/*! \brief fillContinousArray function
 *
 *  \param arr array to be filled in with values.
 *  \param arrSize the size of the array.
 *  \param val the pointer to the value to be filled. If given NULL,
 *         the filling performs arr[i] = i.
 */
template <typename T>
void fillContinousArray(T *arr, size_t arrSize, T *val) {
  if (val == NULL) {

#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
      arr[i] = i;
    }

  }
  else {
    T v = *val;
#ifdef HAVE_ZOLTAN2_OMP
#pragma omp parallel for
#endif
    for (size_t i = 0; i < arrSize; ++i) {
//    std::cout << "writing to i:" << i << " arr:" << arrSize << endl;
      arr[i] = v;
    }
  }
}
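// Example: fillContinousArray<int>(arr, 4, NULL) gives {0, 1, 2, 3};
// with int zero = 0, fillContinousArray<int>(arr, 4, &zero) gives
// {0, 0, 0, 0}.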

/*! \brief CommunicationModel Base Class that performs mapping between
 *  the coordinate partitioning result.
 */
template <typename part_t, typename pcoord_t, typename node_t>
class CommunicationModel{

protected:
  double commCost;

public:

  // Number of processors and number of tasks
  part_t no_procs;
  part_t no_tasks;

  CommunicationModel(): commCost(), no_procs(0), no_tasks(0) {}
  CommunicationModel(part_t no_procs_, part_t no_tasks_):
    commCost(),
    no_procs(no_procs_),
    no_tasks(no_tasks_) {}

  virtual ~CommunicationModel() {}

  part_t getNProcs() const {
    return this->no_procs;
  }

  part_t getNTasks() const {
    return this->no_tasks;
  }

  //sums distance(proc(task), proc(neighbor)) times the edge weight over
  //all directed task-graph edges, and caches the result in commCost.
  void calculateCommunicationCost(
      part_t *task_to_proc,
      part_t *task_communication_xadj,
      part_t *task_communication_adj,
      pcoord_t *task_communication_edge_weight) {

    double totalCost = 0;

    part_t commCount = 0;
    for (part_t task = 0; task < this->no_tasks; ++task) {
      int assigned_proc = task_to_proc[task];

      part_t task_adj_begin = task_communication_xadj[task];
      part_t task_adj_end = task_communication_xadj[task + 1];

      commCount += task_adj_end - task_adj_begin;

      for (part_t task2 = task_adj_begin; task2 < task_adj_end; ++task2) {

        part_t neighborTask = task_communication_adj[task2];
        int neighborProc = task_to_proc[neighborTask];
        double distance = getProcDistance(assigned_proc, neighborProc);

        if (task_communication_edge_weight == NULL) {
          totalCost += distance;
        }
        else {
          totalCost += distance * task_communication_edge_weight[task2];
        }
      }
    }

    // commCount
    this->commCost = totalCost;
  }

  double getCommunicationCostMetric() {
    return this->commCost;
  }

  virtual double getProcDistance(int procId1, int procId2) const = 0;

  /*! \brief Pure virtual function; maps the processors to the tasks and
   *  fills the proc_to_task and task_to_proc arrays.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &proc_to_task_xadj,
      ArrayRCP<part_t> &proc_to_task_adj,
      ArrayRCP<part_t> &task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
  ) const = 0;
};

/*! \brief CoordinateCommunicationModel Class maps the coordinate
 *  partitioning result onto the processors, based on the task coordinates
 *  and the processor (machine) coordinates.
 */
template <typename pcoord_t, typename tcoord_t, typename part_t,
          typename node_t>
class CoordinateCommunicationModel :
    public CommunicationModel<part_t, pcoord_t, node_t> {
public:
  //private:

  // Dimension of the processors
  int proc_coord_dim;
  // Processor coordinates (allocated outside of the class)
  pcoord_t **proc_coords;
  // Dimension of the tasks coordinates.
  int task_coord_dim;
  // Task coordinates (allocated outside of the class)
  tcoord_t **task_coords;

  // TODO: Perhaps delete this and just reference the view size?
  // Need to check the handling of size -1 versus size 0
  int partArraySize;

  Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_partNoArray;

  int *machine_extent;
  bool *machine_extent_wrap_around;
  const MachineRepresentation<pcoord_t, part_t> *machine;

  int num_ranks_per_node;
  bool divide_to_prime_first;

  //public:
  CoordinateCommunicationModel():
    CommunicationModel<part_t, pcoord_t, node_t>(),
    proc_coord_dim(0),
    proc_coords(0),
    task_coord_dim(0),
    task_coords(0),
    partArraySize(-1),
    machine_extent(NULL),
    machine_extent_wrap_around(NULL),
    machine(NULL),
    num_ranks_per_node(1),
    divide_to_prime_first(false){}

  virtual ~CoordinateCommunicationModel() {}

  /*! \brief Class Constructor:
   *  \param pcoord_dim_ the dimension of the processors
   *  \param pcoords_ the processor coordinates. allocated outside of
   *         the class.
   *  \param tcoord_dim_ the dimension of the tasks
   *  \param tcoords_ the task coordinates. allocated outside of the class.
   *  \param no_procs_ the number of processors
   *  \param no_tasks_ the number of tasks.
   */
  CoordinateCommunicationModel(
      int pcoord_dim_,
      pcoord_t **pcoords_,
      int tcoord_dim_,
      tcoord_t **tcoords_,
      part_t no_procs_,
      part_t no_tasks_,
      int *machine_extent_,
      bool *machine_extent_wrap_around_,
      const MachineRepresentation<pcoord_t, part_t> *machine_ = NULL
  ):
    CommunicationModel<part_t, pcoord_t, node_t>(no_procs_, no_tasks_),
    proc_coord_dim(pcoord_dim_), proc_coords(pcoords_),
    task_coord_dim(tcoord_dim_), task_coords(tcoords_),
    partArraySize(-1),
    machine_extent(machine_extent_),
    machine_extent_wrap_around(machine_extent_wrap_around_),
    machine(machine_),
    num_ranks_per_node(1),
    divide_to_prime_first(false) {
  }

  void setPartArraySize(int psize) {
    this->partArraySize = psize;
  }

  void setPartArray(Kokkos::View<part_t *, Kokkos::HostSpace> pNo) {
    this->kokkos_partNoArray = pNo;
  }

  /*! \brief Function is called whenever nprocs > no task.
   *  Function returns only the subset of processors that are closest
   *  to each other.
   *  \param proc_permutation holds the indices of the processors that
   *         are chosen.
   *  \param nprocs the number of processors.
   *  \param ntasks the number of tasks.
   */
  void getClosestSubset(part_t *proc_permutation, part_t nprocs,
                        part_t ntasks) const {
    //currently returns a random subset.

    part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);
    KMeansAlgorithm<part_t, pcoord_t> kma(
        minCoordDim, nprocs,
        this->proc_coords, ntasks);

    kma.kmeans();
    kma.getMinDistanceCluster(proc_permutation);

    for (int i = ntasks; i < nprocs; ++i) {
      proc_permutation[i] = -1;
    }
    /*
    //fill array.
    fillContinousArray<part_t>(proc_permutation, nprocs, NULL);
    '
    int _u_umpa_seed = 847449649;
    srand (time(NULL));

    int a = rand() % 1000 + 1;
    _u_umpa_seed -= a;

    //permute array randomly.
    update_visit_order(proc_permutation, nprocs,_u_umpa_seed, 1);
    */
  }

  // Temporary, necessary for random permutation.
  static part_t umpa_uRandom(part_t l, int &_u_umpa_seed)
  {
    int a = 16807;
    int m = 2147483647;
    int q = 127773;
    int r = 2836;
    int lo, hi, test;
    double d;

    lo = _u_umpa_seed % q;
    hi = _u_umpa_seed / q;
    test = (a * lo) - (r * hi);
    if (test > 0)
      _u_umpa_seed = test;
    else
      _u_umpa_seed = test + m;

    d = (double)((double)_u_umpa_seed / (double)m);

    return (part_t)(d * (double)l);
  }
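  // This is the classic Park-Miller "minimal standard" generator
  // (a = 16807, m = 2^31 - 1) using Schrage's decomposition (q = m / a,
  // r = m % a) to avoid overflow; the result is scaled into [0, l).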

  virtual double getProcDistance(int procId1, int procId2) const {
    pcoord_t distance = 0;
    if (machine == NULL) {
      for (int i = 0; i < this->proc_coord_dim; ++i) {
        double d =
          std::abs(proc_coords[i][procId1] - proc_coords[i][procId2]);
        if (machine_extent_wrap_around && machine_extent_wrap_around[i]) {
          if (machine_extent[i] - d < d) {
            d = machine_extent[i] - d;
          }
        }
        distance += d;
      }
    }
    else {
      this->machine->getHopCount(procId1, procId2, distance);
    }

    return distance;
  }
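  // With wrap-around (torus) dimensions the shorter way around is taken:
  // e.g. extent 8 and coordinates 1 and 7 give d = min(6, 8 - 6) = 2.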

  // Temporary, does random permutation.
  void update_visit_order(part_t* visitOrder, part_t n,
                          int &_u_umpa_seed, part_t rndm) {
    part_t *a = visitOrder;

    if (rndm) {
      part_t i, u, v, tmp;

      if (n <= 4)
        return;

//    srand ( time(NULL) );
//    _u_umpa_seed = _u_umpa_seed1 - (rand()%100);

      for (i = 0; i < n; i += 16) {
        u = umpa_uRandom(n-4, _u_umpa_seed);
        v = umpa_uRandom(n-4, _u_umpa_seed);

        // FIXME (mfh 30 Sep 2015) This requires including
        // Zoltan2_AlgMultiJagged.hpp.

        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v], a[u], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 1], a[u + 1], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 2], a[u + 2], tmp);
        ZOLTAN2_ALGMULTIJAGGED_SWAP(a[v + 3], a[u + 3], tmp);
      }
    }
    else {
      part_t i, end = n / 4;

      for (i = 1; i < end; i++) {
        part_t j = umpa_uRandom(n - i, _u_umpa_seed);
        part_t t = a[j];
        a[j] = a[n - i];
        a[n - i] = t;
      }
    }
//  PermuteInPlace(visitOrder, n);
  }

  /*! \brief Implementation of the virtual getMapping; partitions both the
   *  processor coordinates and the task coordinates, then matches the
   *  resulting parts.
   */
  virtual void getMapping(
      int myRank,
      const RCP<const Environment> &env,
      ArrayRCP<part_t> &rcp_proc_to_task_xadj,
      ArrayRCP<part_t> &rcp_proc_to_task_adj,
      ArrayRCP<part_t> &rcp_task_to_proc,
      const Teuchos::RCP<const Teuchos::Comm<int> > comm_
  ) const {

    rcp_proc_to_task_xadj = ArrayRCP<part_t>(this->no_procs + 1);
    rcp_proc_to_task_adj = ArrayRCP<part_t>(this->no_tasks);
    rcp_task_to_proc = ArrayRCP<part_t>(this->no_tasks);

    // Holds the pointer to the task array
    part_t *proc_to_task_xadj = rcp_proc_to_task_xadj.getRawPtr();

    // Holds the indices of task wrt to proc_to_task_xadj array.
    part_t *proc_to_task_adj = rcp_proc_to_task_adj.getRawPtr();

    // Holds the processors mapped to tasks.
    part_t *task_to_proc = rcp_task_to_proc.getRawPtr();

    part_t invalid = 0;
    fillContinousArray<part_t>(proc_to_task_xadj, this->no_procs + 1,
                               &invalid);

    // Obtain the number of parts that should be divided.
    part_t num_parts = MINOF(this->no_procs, this->no_tasks);

    // Obtain the min coordinate dim.
    // We no longer use the min coord dim: if machine dimension > task_dim,
    // we end up with a long line.
//  part_t minCoordDim = MINOF(this->task_coord_dim, this->proc_coord_dim);

    int recursion_depth = partArraySize;

//  if (partArraySize < minCoordDim)
//    recursion_depth = minCoordDim;
    if (partArraySize == -1) {
      if (divide_to_prime_first) {
        // It is difficult to estimate the number of steps in this case
        // as each branch will have different depth.
        // The worst case happens when all prime factors are 3s.
        // P = 3^n, n recursion depth will divide parts to 2x and x
        // and n recursion depth with divide 2x into x and x.
        // Set it to upperbound here.
        // We could calculate the exact value here as well, but the
        // partitioning algorithm skips further ones anyways.
        recursion_depth = log(float(this->no_procs)) / log(2.0) * 2 + 1;
      }
      else {
        recursion_depth = log(float(this->no_procs)) / log(2.0) + 1;
      }
    }

    // Number of permutations for tasks and processors
    int taskPerm = 1;
    int procPerm = 1;

    // Get number of different permutations for task dimension ordering
    if (this->task_coord_dim <= 8)
      taskPerm = z2Fact<int>(this->task_coord_dim);
    // Prevent overflow
    else
      taskPerm = z2Fact<int>(8);

    // Get number of different permutations for proc dimension ordering
    if (this->proc_coord_dim <= 8)
      procPerm = z2Fact<int>(this->proc_coord_dim);
    // Prevent overflow
    else
      procPerm = z2Fact<int>(8);

    // Total number of permutations (both task and proc permuted)
    int permutations = taskPerm * procPerm;

    // Add permutations where we divide the processors with longest
    // dimension but permute tasks.
    permutations += taskPerm;

    // Add permutations where we divide the tasks with longest
    // dimension but permute procs.
    permutations += procPerm;

    // Add permutation with both tasks and procs divided by longest
    // dimension
    permutations += 1;
    //add one also that partitions based on the longest dimension.
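    // For example, with proc_coord_dim = 3 and task_coord_dim = 2:
    // procPerm = 3! = 6, taskPerm = 2! = 2, so permutations =
    // 6 * 2 + 2 + 6 + 1 = 21 distinct mapping attempts, one per rank in a
    // subcommunicator group (see create_subCommunicator further below).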

    // Holds the pointers to proc_adjList
    part_t *proc_xadj = new part_t[num_parts + 1];

    // Holds the processors in parts according to the result of
    // partitioning algorithm.
    // The processors assigned to part x is at
    // proc_adjList[ proc_xadj[x] : proc_xadj[x + 1] ]
    part_t *proc_adjList = new part_t[this->no_procs];

    part_t used_num_procs = this->no_procs;
    if (this->no_procs > this->no_tasks) {
      // Obtain the subset of the processors that are closest to each
      // other.
      this->getClosestSubset(proc_adjList, this->no_procs,
                             this->no_tasks);
      used_num_procs = this->no_tasks;
    }
    else {
      fillContinousArray<part_t>(proc_adjList, this->no_procs, NULL);
    }

    // Index of the permutation
    int myPermutation = myRank % permutations;
    bool task_partition_along_longest_dim = false;
    bool proc_partition_along_longest_dim = false;

    int myProcPerm = 0;
    int myTaskPerm = 0;

    if (myPermutation == 0) {
      task_partition_along_longest_dim = true;
      proc_partition_along_longest_dim = true;
    }
    else {
      --myPermutation;
      if (myPermutation < taskPerm) {
        proc_partition_along_longest_dim = true;
        // Index of the task permutation
        myTaskPerm = myPermutation;
      }
      else {
        myPermutation -= taskPerm;
        if (myPermutation < procPerm) {
          task_partition_along_longest_dim = true;
          // Index of the proc permutation
          myProcPerm = myPermutation;
        }
        else {
          myPermutation -= procPerm;
          // Index of the proc permutation
          myProcPerm = myPermutation % procPerm;
          // Index of the task permutation
          myTaskPerm = myPermutation / procPerm;
        }
      }
    }

/*
    if (task_partition_along_longest_dim &&
        proc_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task:longest proc:longest"
                << " numPerms:" << permutations << std::endl;
    }
    else if (proc_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task:" << myTaskPerm
                << " proc:longest" << " numPerms:" << permutations
                << std::endl;
    }
    else if (task_partition_along_longest_dim) {
      std::cout << "me:" << myRank << " task: longest" << " proc:"
                << myProcPerm << " numPerms:" << permutations << std::endl;
    }
    else {
      std::cout << "me:" << myRank << " task:" << myTaskPerm << " proc:"
                << myProcPerm << " numPerms:" << permutations << std::endl;
    }
*/

    int *permutation = new int[(this->proc_coord_dim > this->task_coord_dim)
                                 ? this->proc_coord_dim
                                 : this->task_coord_dim];

    // Get the permutation order from the proc permutation index.
    if (this->proc_coord_dim <= 8)
      ithPermutation<int>(this->proc_coord_dim, myProcPerm, permutation);
    else
      ithPermutation<int>(8, myProcPerm, permutation);

/*
    // Reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(this->proc_coord_dim);
    for (int i = 0; i < this->proc_coord_dim; ++i) {
      pcoords[i] = this->proc_coords[permutation[i]];
//    std::cout << permutation[i] << " ";
    }
*/

    int procdim = this->proc_coord_dim;
    pcoord_t **pcoords = this->proc_coords;

/*
    int procdim = this->proc_coord_dim;
    procdim = 6;
    //reorder the coordinate dimensions.
    pcoord_t **pcoords = allocMemory<pcoord_t *>(procdim);
    for (int i = 0; i < procdim; ++i) {
      pcoords[i] = new pcoord_t[used_num_procs];
//    this->proc_coords[permutation[i]];
    }

    for (int k = 0; k < used_num_procs; k++) {
      pcoords[0][k] = (int (this->proc_coords[0][k]) / 2) * 64;
      pcoords[3][k] = (int (this->proc_coords[0][k]) % 2) * 8;

      pcoords[1][k] = (int (this->proc_coords[1][k]) / 2) * 8 * 2400;
      pcoords[4][k] = (int (this->proc_coords[1][k]) % 2) * 8;
      pcoords[2][k] = ((int (this->proc_coords[2][k])) / 8) * 160;
      pcoords[5][k] = ((int (this->proc_coords[2][k])) % 8) * 5;

      //if (this->proc_coords[0][k] == 40 &&
      //    this->proc_coords[1][k] == 8 &&
      //    this->proc_coords[2][k] == 48) {
      if (this->proc_coords[0][k] == 5 &&
          this->proc_coords[1][k] == 0 &&
          this->proc_coords[2][k] == 10) {
        std::cout << "pcoords[0][k]:" << pcoords[0][k]
                  << "pcoords[1][k]:" << pcoords[1][k]
                  << "pcoords[2][k]:" << pcoords[2][k]
                  << "pcoords[3][k]:" << pcoords[3][k]
                  << "pcoords[4][k]:" << pcoords[4][k]
                  << "pcoords[5][k]:" << pcoords[5][k] << std::endl;
      }
      else if (pcoords[0][k] == 64 &&
               pcoords[1][k] == 0 &&
               pcoords[2][k] == 160 &&
               pcoords[3][k] == 16 &&
               pcoords[4][k] == 0 &&
               pcoords[5][k] == 10) {
        std::cout << "this->proc_coords[0][k]:" << this->proc_coords[0][k]
                  << "this->proc_coords[1][k]:" << this->proc_coords[1][k]
                  << "this->proc_coords[2][k]:" << this->proc_coords[2][k]
                  << std::endl;
      }
    }
*/

//  if (partNoArray == NULL)
//    std::cout << "partNoArray is null" << std::endl;
//  std::cout << "recursion_depth:" << recursion_depth
//            << " partArraySize:" << partArraySize << std::endl;

    // Optimization for Dragonfly Networks, First Level of partitioning
    // is imbalanced to ensure procs are divided by first RCA
    // coord (a.k.a. group).
    part_t num_group_count = 1;
    part_t *group_count = NULL;

    if (machine != NULL)
      num_group_count = machine->getNumUniqueGroups();

    if (num_group_count > 1) {
      group_count = new part_t[num_group_count];
      memset(group_count, 0, sizeof(part_t) * num_group_count);

      machine->getGroupCount(group_count);
    }

    // Do the partitioning and renumber the parts.
    env->timerStart(MACRO_TIMERS, "Mapping - Proc Partitioning");
    // Partitioning of Processors
    AlgMJ<pcoord_t, part_t, part_t, part_t, node_t> mj_partitioner;

    typedef typename node_t::device_type device_t;
    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives
    // LayoutLeft
    Kokkos::View<pcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_pcoords("pcoords", this->no_procs, procdim);
    auto host_kokkos_pcoords = Kokkos::create_mirror_view(kokkos_pcoords);
    for (int i = 0; i < procdim; ++i) {
      for (int j = 0; j < this->no_procs; ++j) {
        host_kokkos_pcoords(j, i) = pcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_pcoords, host_kokkos_pcoords);

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_pcoords(
        "initial_selected_coords_output_permutation_pcoords",
        this->no_procs);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_pcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      host_initial_selected_coords_output_permutation_pcoords(n) =
        proc_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_pcoords,
      host_initial_selected_coords_output_permutation_pcoords);

    // Note num_group_count = 1 when group_count = NULL - perhaps could
    // change
    Kokkos::View<part_t *, Kokkos::HostSpace> kokkos_group_count(
      "kokkos_group_count", group_count ? num_group_count : 0);
    if (group_count) {
      for (int n = 0; n < num_group_count; ++n) {
        kokkos_group_count(n) = group_count[n];
      }
    }

    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_procs,
      used_num_procs,
      num_parts,
      procdim,
      //minCoordDim,
      kokkos_pcoords,
      initial_selected_coords_output_permutation_pcoords,
      proc_xadj,
      recursion_depth,
      kokkos_partNoArray,
      proc_partition_along_longest_dim, // false
      num_ranks_per_node,
      divide_to_prime_first,
      num_group_count,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Proc Partitioning");
//  comm_->barrier();
//  std::cout << "mj_partitioner.for procs over" << std::endl;
//  freeArray<pcoord_t *>(pcoords);

    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_pcoords,
      initial_selected_coords_output_permutation_pcoords);
    for (int n = 0; n < this->no_procs; ++n) {
      proc_adjList[n] =
        host_initial_selected_coords_output_permutation_pcoords(n);
    }

    part_t *task_xadj = new part_t[num_parts + 1];
    part_t *task_adjList = new part_t[this->no_tasks];

    // Fill task_adjList st: task_adjList[i] <- i.
    fillContinousArray<part_t>(task_adjList, this->no_tasks, NULL);

    // Get the permutation order from the task permutation index.
    if (this->task_coord_dim <= 8)
      ithPermutation<int>(this->task_coord_dim, myTaskPerm, permutation);
    else
      ithPermutation<int>(8, myTaskPerm, permutation);

    // Reorder task coordinate dimensions.
    tcoord_t **tcoords = new tcoord_t*[this->task_coord_dim];
    for (int i = 0; i < this->task_coord_dim; ++i) {
      tcoords[i] = this->task_coords[permutation[i]];
    }

    // coordinates in MJ are LayoutLeft since Tpetra Multivector gives
    // LayoutLeft
    Kokkos::View<tcoord_t**, Kokkos::LayoutLeft, device_t>
      kokkos_tcoords("tcoords", this->no_tasks, this->task_coord_dim);
    auto host_kokkos_tcoords = Kokkos::create_mirror_view(kokkos_tcoords);
    for (int i = 0; i < this->task_coord_dim; ++i) {
      for (int j = 0; j < this->no_tasks; ++j) {
        host_kokkos_tcoords(j, i) = tcoords[i][j];
      }
    }
    Kokkos::deep_copy(kokkos_tcoords, host_kokkos_tcoords);

    env->timerStart(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::View<part_t*, device_t>
      initial_selected_coords_output_permutation_tcoords(
        "initial_selected_coords_output_permutation_tcoords",
        this->no_tasks);
    typename Kokkos::View<part_t*, device_t>::HostMirror
      host_initial_selected_coords_output_permutation_tcoords =
        Kokkos::create_mirror_view(
          initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      host_initial_selected_coords_output_permutation_tcoords(n) =
        task_adjList[n];
    }
    Kokkos::deep_copy(initial_selected_coords_output_permutation_tcoords,
      host_initial_selected_coords_output_permutation_tcoords);

    //partitioning of tasks
    mj_partitioner.sequential_task_partitioning(
      env,
      this->no_tasks,
      this->no_tasks,
      num_parts,
      this->task_coord_dim,
      //minCoordDim,
      kokkos_tcoords,
      initial_selected_coords_output_permutation_tcoords,
      task_xadj,
      recursion_depth,
      kokkos_partNoArray,
      task_partition_along_longest_dim,
      num_ranks_per_node,
      divide_to_prime_first,
      num_group_count,
      kokkos_group_count);
    env->timerStop(MACRO_TIMERS, "Mapping - Task Partitioning");

    Kokkos::deep_copy(host_initial_selected_coords_output_permutation_tcoords,
      initial_selected_coords_output_permutation_tcoords);
    for (int n = 0; n < this->no_tasks; ++n) {
      task_adjList[n] =
        host_initial_selected_coords_output_permutation_tcoords(n);
    }

//  std::cout << "myrank:" << myRank << std::endl;
//  comm_->barrier();
//  std::cout << "mj_partitioner.sequential_task_partitioning over"
//            << std::endl;

    delete [] tcoords;
    delete [] permutation;

    //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t proc_index_end = proc_xadj[i + 1];
      part_t task_end_index = task_xadj[i + 1];

      if (proc_index_end - proc_index_begin != 1) {
        std::cerr << "Error at partitioning of processors" << std::endl;
        std::cerr << "PART:" << i << " is assigned to "
                  << proc_index_end - proc_index_begin << " processors."
                  << std::endl;
        std::terminate();
      }
      part_t assigned_proc = proc_adjList[proc_index_begin];
      proc_to_task_xadj[assigned_proc] = task_end_index - task_begin_index;
    }

    //holds the pointer to the task array.
    //convert proc_to_task_xadj from per-proc task counts to a CSR offset
    //array; proc_to_task_xadj_work keeps each proc's end offset so that
    //the adjacency list can be filled backwards below.
    part_t *proc_to_task_xadj_work = new part_t[this->no_procs];
    part_t sum = 0;
    for (part_t i = 0; i < this->no_procs; ++i) {
      part_t tmp = proc_to_task_xadj[i];
      proc_to_task_xadj[i] = sum;
      sum += tmp;
      proc_to_task_xadj_work[i] = sum;
    }
    proc_to_task_xadj[this->no_procs] = sum;

    for (part_t i = 0; i < num_parts; ++i) {

      part_t proc_index_begin = proc_xadj[i];
      part_t task_begin_index = task_xadj[i];
      part_t task_end_index = task_xadj[i + 1];

      part_t assigned_proc = proc_adjList[proc_index_begin];

      for (part_t j = task_begin_index; j < task_end_index; ++j) {
        part_t taskId = task_adjList[j];

        task_to_proc[taskId] = assigned_proc;

        proc_to_task_adj[--proc_to_task_xadj_work[assigned_proc]] = taskId;
      }
    }

/*
    if (myPermutation == 0) {
      std::ofstream gnuPlotCode("mymapping.out", std::ofstream::out);

      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
                    << proc_index_end - proc_index_begin << " processors."
                    << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
                    << this->proc_coords[0][assigned_proc] << " "
                    << this->proc_coords[1][assigned_proc] << " "
                    << this->proc_coords[2][assigned_proc] << " "
                    << pcoords[0][assigned_proc] << " "
                    << pcoords[1][assigned_proc] << " "
                    << pcoords[2][assigned_proc] << " "
                    << pcoords[3][assigned_proc] << std::endl;
      }

      gnuPlotCode << "Machine Extent:" << std::endl;
      //filling proc_to_task_xadj, proc_to_task_adj, task_to_proc arrays.
      for (part_t i = 0; i < num_parts; ++i) {

        part_t proc_index_begin = proc_xadj[i];
        part_t proc_index_end = proc_xadj[i + 1];

        if (proc_index_end - proc_index_begin != 1) {
          std::cerr << "Error at partitioning of processors" << std::endl;
          std::cerr << "PART:" << i << " is assigned to "
                    << proc_index_end - proc_index_begin << " processors."
                    << std::endl;
          exit(1);
        }

        part_t assigned_proc = proc_adjList[proc_index_begin];
        gnuPlotCode << "Rank:" << i << " "
                    << this->proc_coords[0][assigned_proc] << " "
                    << this->proc_coords[1][assigned_proc] << " "
                    << this->proc_coords[2][assigned_proc] << std::endl;
      }
      gnuPlotCode.close();
    }
*/

    delete [] proc_to_task_xadj_work;
    delete [] task_xadj;
    delete [] task_adjList;
    delete [] proc_xadj;
    delete [] proc_adjList;
  }

};

template <typename Adapter, typename part_t>
class CoordinateTaskMapper : public PartitionMapping<Adapter> {
protected:

#ifndef DOXYGEN_SHOULD_SKIP_THIS

  typedef typename Adapter::scalar_t pcoord_t;
  typedef typename Adapter::scalar_t tcoord_t;
  typedef typename Adapter::scalar_t scalar_t;
  typedef typename Adapter::lno_t lno_t;

#if defined(KOKKOS_ENABLE_CUDA)
  using node_t = Kokkos::Compat::KokkosDeviceWrapperNode<
    Kokkos::Cuda, Kokkos::CudaSpace>;
#elif defined(KOKKOS_ENABLE_HIP)
  using node_t = Kokkos::Compat::KokkosDeviceWrapperNode<
    Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace>;
#else
  using node_t = typename Adapter::node_t;
#endif

#endif

//RCP<const Environment> env;

  // Holds the pointer to the task array
  ArrayRCP<part_t> proc_to_task_xadj;
//  = allocMemory<part_t> (this->no_procs + 1);

  // Holds the indices of tasks wrt to proc_to_task_xadj array.
  ArrayRCP<part_t> proc_to_task_adj;
//  = allocMemory<part_t>(this->no_tasks);

  // Holds the processors mapped to tasks.
  ArrayRCP<part_t> task_to_proc;
//  = allocMemory<part_t>(this->no_procs);

  // Holds the processors mapped to tasks.
  ArrayRCP<part_t> local_task_to_rank;
//  = allocMemory<part_t>(this->no_procs);

  bool isOwnerofModel;
  CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
    *proc_task_comm;
  part_t nprocs;
  part_t ntasks;
  ArrayRCP<part_t> task_communication_xadj;
  ArrayRCP<part_t> task_communication_adj;
  ArrayRCP<scalar_t> task_communication_edge_weight;

  /*! \brief doMapping function, calls getMapping function of the
   *  communicationModel object.
   */
  void doMapping(int myRank,
                 const Teuchos::RCP<const Teuchos::Comm<int> > comm_) {

    if (this->proc_task_comm) {
      this->proc_task_comm->getMapping(
        myRank,
        this->env,
        // Holds the pointer to the task array
        this->proc_to_task_xadj,
        // Holds the indices of task wrt to proc_to_task_xadj array
        this->proc_to_task_adj,
        // Holds the processors mapped to tasks
        this->task_to_proc,
        comm_
      );
    }
    else {
      std::cerr << "communicationModel is not specified in the Mapper"
                << std::endl;
      exit(1);
    }
  }

  /*! \brief Creates and returns the subcommunicator for the processor
   *  group that this rank belongs to.
   */
  RCP<Comm<int> > create_subCommunicator() {
    int procDim = this->proc_task_comm->proc_coord_dim;
    int taskDim = this->proc_task_comm->task_coord_dim;

    // Number of permutations for tasks and processors
    int taskPerm = 1;
    int procPerm = 1;

    // Get number of different permutations for task dimension ordering
    if (taskDim <= 8)
      taskPerm = z2Fact<int>(taskDim);
    // Prevent overflow
    else
      taskPerm = z2Fact<int>(8);

    // Get number of different permutations for proc dimension ordering
    if (procDim <= 8)
      procPerm = z2Fact<int>(procDim);
    // Prevent overflow
    else
      procPerm = z2Fact<int>(8);

    // Total number of permutations
    int idealGroupSize = taskPerm * procPerm;

    // For the one that does longest dimension partitioning.
    idealGroupSize += taskPerm + procPerm + 1;

    int myRank = this->comm->getRank();
    int commSize = this->comm->getSize();

    int myGroupIndex = myRank / idealGroupSize;

    int prevGroupBegin = (myGroupIndex - 1) * idealGroupSize;
    if (prevGroupBegin < 0) prevGroupBegin = 0;
    int myGroupBegin = myGroupIndex * idealGroupSize;
    int myGroupEnd = (myGroupIndex + 1) * idealGroupSize;
    int nextGroupEnd = (myGroupIndex + 2) * idealGroupSize;

    if (myGroupEnd > commSize) {
      myGroupBegin = prevGroupBegin;
      myGroupEnd = commSize;
    }
    if (nextGroupEnd > commSize) {
      myGroupEnd = commSize;
    }
    int myGroupSize = myGroupEnd - myGroupBegin;

    part_t *myGroup = new part_t[myGroupSize];
    for (int i = 0; i < myGroupSize; ++i) {
      myGroup[i] = myGroupBegin + i;
    }
//  std::cout << "me:" << myRank << " myGroupBegin:" << myGroupBegin
//            << " myGroupEnd:" << myGroupEnd << endl;

    ArrayView<const part_t> myGroupView(myGroup, myGroupSize);

    RCP<Comm<int> > subComm =
      this->comm->createSubcommunicator(myGroupView);
    delete [] myGroup;
    return subComm;
  }
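  // The group size matches the `permutations` count computed in getMapping
  // (taskPerm * procPerm + taskPerm + procPerm + 1), so within a full group
  // every rank evaluates a distinct permutation and the best result can be
  // selected per group; trailing ranks that cannot form a full group are
  // merged into the previous one.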

  /*! \brief Finds the lowest cost mapping within the subcommunicator
   *  and broadcasts the solution to everyone.
   */
  void getBestMapping() {
    //create the sub group.
    RCP<Comm<int> > subComm = this->create_subCommunicator();
    //calculate cost.
    double myCost = this->proc_task_comm->getCommunicationCostMetric();
//  std::cout << "me:" << this->comm->getRank() << " myCost:"
//            << myCost << std::endl;
    double localCost[2], globalCost[2];

    localCost[0] = myCost;
    localCost[1] = double(subComm->getRank());

    globalCost[1] = globalCost[0] = std::numeric_limits<double>::max();
    Teuchos::Zoltan2_ReduceBestMapping<int, double> reduceBest;
    reduceAll<int, double>(*subComm, reduceBest,
                           2, localCost, globalCost);

    int sender = int(globalCost[1]);

/*
    if (this->comm->getRank() == 0) {
      std::cout << "me:" << localCost[1] <<
        " localcost:" << localCost[0] <<
        " bestcost:" << globalCost[0] <<
        " Sender:" << sender <<
        " procDim" << proc_task_comm->proc_coord_dim <<
        " taskDim:" << proc_task_comm->task_coord_dim << std::endl;
    }
*/

//  std::cout << "me:" << localCost[1] << " localcost:" << localCost[0]
//            << " bestcost:" << globalCost[0] << endl;
//  std::cout << "me:" << localCost[1] << " proc:" << globalCost[1]
//            << endl;
    broadcast(*subComm, sender, this->ntasks,
              this->task_to_proc.getRawPtr());
    broadcast(*subComm, sender, this->nprocs,
              this->proc_to_task_xadj.getRawPtr());
    broadcast(*subComm, sender, this->ntasks,
              this->proc_to_task_adj.getRawPtr());
  }

  //write mapping to gnuPlot code to visualize.
  void writeMapping() {
    std::ofstream gnuPlotCode("gnuPlot.plot", std::ofstream::out);

    int mindim = MINOF(proc_task_comm->proc_coord_dim,
                       proc_task_comm->task_coord_dim);
    std::string ss = "";
    for (part_t i = 0; i < this->nprocs; ++i) {

      std::string procFile = Teuchos::toString<int>(i) + "_mapping.txt";
      if (i == 0) {
        gnuPlotCode << "plot \"" << procFile << "\"\n";
      }
      else {
        gnuPlotCode << "replot \"" << procFile << "\"\n";
      }

      std::ofstream inpFile(procFile.c_str(), std::ofstream::out);

      std::string gnuPlotArrow = "set arrow from ";
      for (int j = 0; j < mindim; ++j) {
        if (j == mindim - 1) {
          inpFile << proc_task_comm->proc_coords[j][i];
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->proc_coords[j][i]);

        }
        else {
          inpFile << proc_task_comm->proc_coords[j][i] << " ";
          gnuPlotArrow +=
            Teuchos::toString<float>(proc_task_comm->
                                       proc_coords[j][i]) + ",";
        }
      }
      gnuPlotArrow += " to ";

      inpFile << std::endl;
      ArrayView<part_t> a = this->getAssignedTasksForProc(i);

      for (int k = 0; k < a.size(); ++k) {
        int j = a[k];
//      std::cout << "i:" << i << " j:"
        std::string gnuPlotArrow2 = gnuPlotArrow;
        for (int z = 0; z < mindim; ++z) {
          if (z == mindim - 1) {

//          std::cout << "z:" << z << " j:" << j << " "
//                    << proc_task_comm->task_coords[z][j] << endl;
            inpFile << proc_task_comm->task_coords[z][j];
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->task_coords[z][j]);
          }
          else {
            inpFile << proc_task_comm->task_coords[z][j] << " ";
            gnuPlotArrow2 +=
              Teuchos::toString<float>(proc_task_comm->
                                         task_coords[z][j]) + ",";
          }
        }
        ss += gnuPlotArrow2 + "\n";
        inpFile << std::endl;
      }
      inpFile.close();
    }
    gnuPlotCode << ss;
    gnuPlotCode << "\nreplot\n pause -1 \n";
    gnuPlotCode.close();
  }
2006
2007 //write mapping to gnuPlot code to visualize.
2008 void writeMapping2(int myRank) {
2009 std::string rankStr = Teuchos::toString<int>(myRank);
2010 std::string gnuPlots = "gnuPlot", extentionS = ".plot";
2011 std::string outF = gnuPlots + rankStr+ extentionS;
2012 std::ofstream gnuPlotCode(outF.c_str(), std::ofstream::out);
2013
2014 CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>
2015 *tmpproc_task_comm =
2016 static_cast <CoordinateCommunicationModel<
2017 pcoord_t, tcoord_t, part_t, node_t> * > (
2018 proc_task_comm);
2019
2020// int mindim = MINOF(tmpproc_task_comm->proc_coord_dim,
2021// tmpproc_task_comm->task_coord_dim);
2022 int mindim = tmpproc_task_comm->proc_coord_dim;
2023 if (mindim != 3) {
2024 std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
2025 return;
2026 }
2027 std::string ss = "";
2028 std::string procs = "";
2029
2030 std::set < std::tuple<int,int,int,int,int,int> > my_arrows;
2031 for (part_t origin_rank = 0; origin_rank < this->nprocs; ++origin_rank) {
2032 ArrayView<part_t> a = this->getAssignedTasksForProc(origin_rank);
2033 if (a.size() == 0) {
2034 continue;
2035 }
2036
2037 std::string gnuPlotArrow = "set arrow from ";
2038 for (int j = 0; j < mindim; ++j) {
2039 if (j == mindim - 1) {
2040 gnuPlotArrow +=
2041 Teuchos::toString<float>(tmpproc_task_comm->
2042 proc_coords[j][origin_rank]);
2043 procs +=
2044 Teuchos::toString<float>(tmpproc_task_comm->
2045 proc_coords[j][origin_rank]);
2046
2047 }
2048 else {
2049 gnuPlotArrow +=
2050 Teuchos::toString<float>(tmpproc_task_comm->
2051 proc_coords[j][origin_rank]) + ",";
2052 procs +=
2053 Teuchos::toString<float>(tmpproc_task_comm->
2054 proc_coords[j][origin_rank])+ " ";
2055 }
2056 }
2057 procs += "\n";
2058
2059 gnuPlotArrow += " to ";
2060
2061
2062 for (int k = 0; k < a.size(); ++k) {
2063 int origin_task = a[k];
2064
2065 for (int nind = task_communication_xadj[origin_task];
2066 nind < task_communication_xadj[origin_task + 1]; ++nind) {
2067 int neighbor_task = task_communication_adj[nind];
2068
2069 bool differentnode = false;
2070
2071 int neighbor_rank = this->getAssignedProcForTask(neighbor_task);
2072
2073 for (int j = 0; j < mindim; ++j) {
2074 if (int(tmpproc_task_comm->proc_coords[j][origin_rank]) !=
2075 int(tmpproc_task_comm->proc_coords[j][neighbor_rank])) {
2076 differentnode = true; break;
2077 }
2078 }
2079 std::tuple<int,int,int, int, int, int> foo(
2080 int(tmpproc_task_comm->proc_coords[0][origin_rank]),
2081 int(tmpproc_task_comm->proc_coords[1][origin_rank]),
2082 int(tmpproc_task_comm->proc_coords[2][origin_rank]),
2083 int(tmpproc_task_comm->proc_coords[0][neighbor_rank]),
2084 int(tmpproc_task_comm->proc_coords[1][neighbor_rank]),
2085 int(tmpproc_task_comm->proc_coords[2][neighbor_rank]));
2086
2087
2088 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
2089 my_arrows.insert(foo);
2090
2091 std::string gnuPlotArrow2 = "";
2092 for (int j = 0; j < mindim; ++j) {
2093 if (j == mindim - 1) {
2094 gnuPlotArrow2 +=
2095 Teuchos::toString<float>(tmpproc_task_comm->
2096 proc_coords[j][neighbor_rank]);
2097 }
2098 else {
2099 gnuPlotArrow2 +=
2100 Teuchos::toString<float>(tmpproc_task_comm->
2101 proc_coords[j][neighbor_rank]) + ",";
2102 }
2103 }
2104 ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
2105 }
2106 }
2107 }
2108 }
2109
2110 std::ofstream procFile("procPlot.plot", std::ofstream::out);
2111 procFile << procs << "\n";
2112 procFile.close();
2113
2114 //gnuPlotCode << ss;
2115 if (mindim == 2) {
2116 gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
2117 } else {
2118 gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
2119 }
2120
2121 gnuPlotCode << ss << "\nreplot\n pause -1 \n";
2122 gnuPlotCode.close();
2123 }
2124
2125
2126// KDD Need to provide access to algorithm for getPartBoxes
2127#ifdef gnuPlot
2128 void writeGnuPlot(
2129 const Teuchos::Comm<int> *comm_,
2130 const Zoltan2::PartitioningSolution<Adapter> *soln_,
2131 int coordDim,
2132 tcoord_t **partCenters) {
2133
2134 std::string file = "gggnuPlot";
2135 std::string exten = ".plot";
2136 std::ofstream mm("2d.txt");
2137 file += Teuchos::toString<int>(comm_->getRank()) + exten;
2138 std::ofstream ff(file.c_str());
2139// ff.seekg(0, ff.end);
2140 std::vector<Zoltan2::coordinateModelPartBox <tcoord_t, part_t> >
2141 outPartBoxes =
2142 soln_->getPartBoxesView();
2144
2145 for (part_t i = 0; i < this->ntasks;++i) {
2146 outPartBoxes[i].writeGnuPlot(ff, mm);
2147 }
2148 if (coordDim == 2) {
2149 ff << "plot \"2d.txt\"" << std::endl;
2150// ff << "\n pause -1" << endl;
2151 }
2152 else {
2153 ff << "splot \"2d.txt\"" << std::endl;
2154// ff << "\n pause -1" << endl;
2155 }
2156 mm.close();
2157
2158 ff << "set style arrow 5 nohead size screen 0.03,15,135 ls 1"
2159 << std::endl;
2160
2161 for (part_t i = 0; i < this->ntasks; ++i) {
2162 part_t pb = task_communication_xadj[i];
2163 part_t pe = task_communication_xadj[i + 1];
2164
2165 for (part_t p = pb; p < pe; ++p) {
2166 part_t n = task_communication_adj[p];
2167
2168// std::cout << "i:" << i << " n:" << n << endl;
2169 std::string arrowline = "set arrow from ";
2170 for (int j = 0; j < coordDim - 1; ++j) {
2171 arrowline +=
2172 Teuchos::toString<tcoord_t>(partCenters[j][n]) + ",";
2173 }
2174 arrowline +=
2175 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][n]) +
2176 " to ";
2177
2178 for (int j = 0; j < coordDim - 1; ++j) {
2179 arrowline +=
2180 Teuchos::toString<tcoord_t>(partCenters[j][i]) + ",";
2181 }
2182 arrowline +=
2183 Teuchos::toString<tcoord_t>(partCenters[coordDim - 1][i]) +
2184 " as 5\n";
2185
2186// std::cout << "arrow:" << arrowline << endl;
2187 ff << arrowline;
2188 }
2189 }
2190
2191 ff << "replot\n pause -1" << std::endl;
2192 ff.close();
2193 }
2194#endif // gnuPlot
2195
2196public:
2197
2198 void getProcTask(part_t* &proc_to_task_xadj_,
2199 part_t* &proc_to_task_adj_) {
2200 proc_to_task_xadj_ = this->proc_to_task_xadj.getRawPtr();
2201 proc_to_task_adj_ = this->proc_to_task_adj.getRawPtr();
2202 }
2203
2204 virtual void map(const RCP<MappingSolution<Adapter> > &mappingsoln) {
2205
2206 // Mapping was already computed in the constructor; we need to store it
2207 // in the solution.
2208 mappingsoln->setMap_RankForLocalElements(local_task_to_rank);
2209
2210 // KDDKDD TODO: Algorithm is also creating task_to_proc, which maybe
2211 // KDDKDD is not needed once we use MappingSolution to answer queries
2212 // KDDKDD instead of this algorithm.
2213 // KDDKDD Ask Mehmet: what is the most efficient way to get the answer
2214 // KDDKDD out of CoordinateTaskMapper and into the MappingSolution?
2215 }
2216
2217
2218 virtual ~CoordinateTaskMapper() {
2219 //freeArray<part_t>(proc_to_task_xadj);
2220 //freeArray<part_t>(proc_to_task_adj);
2221 //freeArray<part_t>(task_to_proc);
2222 if(this->isOwnerofModel) {
2223 delete this->proc_task_comm;
2224 }
2225 }
2226
2227 void create_local_task_to_rank(
2228 const lno_t num_local_coords,
2229 const part_t *local_coord_parts,
2230 const ArrayRCP<part_t> task_to_proc_) {
2231 local_task_to_rank = ArrayRCP <part_t>(num_local_coords);
2232
2233 for (lno_t i = 0; i < num_local_coords; ++i) {
2234 part_t local_coord_part = local_coord_parts[i];
2235 part_t rank_index = task_to_proc_[local_coord_part];
2236 local_task_to_rank[i] = rank_index;
2237 }
2238 }
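// Example of the lookup above: with local_coord_parts = {2, 0, 2} and
// task_to_proc_ = {5, 9, 4}, local_task_to_rank becomes {4, 5, 4};
// every local coordinate is tagged with the rank that owns its part.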
2239
2240
2241
2255 CoordinateTaskMapper(
2256 const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2257 const Teuchos::RCP <const MachineRepresentation<pcoord_t, part_t> >
2258 machine_,
2259 const Teuchos::RCP <const Adapter> input_adapter_,
2260 const Teuchos::RCP <const Zoltan2::PartitioningSolution<Adapter> >
2261 soln_,
2262 const Teuchos::RCP <const Environment> envConst,
2263 bool is_input_adapter_distributed = true,
2264 int num_ranks_per_node = 1,
2265 bool divide_to_prime_first = false,
2266 bool reduce_best_mapping = true):
2267 PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2268 soln_, envConst),
2269 proc_to_task_xadj(0),
2270 proc_to_task_adj(0),
2271 task_to_proc(0),
2272 isOwnerofModel(true),
2273 proc_task_comm(0),
2274 task_communication_xadj(0),
2275 task_communication_adj(0),
2276 task_communication_edge_weight(0) {
2277
2278 using namespace Teuchos;
2279 typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2280
2281 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2282 RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2283
2284 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2285 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2286 if (!is_input_adapter_distributed) {
2287 ia_comm = Teuchos::createSerialComm<int>();
2288 }
2289
2290 RCP<const Environment> envConst_ = envConst;
2291
2292 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2293 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2294 input_adapter_.getRawPtr()), false));
2295
2296 modelFlag_t coordFlags_, graphFlags_;
2297
2298 //create coordinate model
2299 //since this is coordinate task mapper,
2300 //the adapter has to have the coordinates
2301 coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2302 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2303
2304 //if the adapter has also graph model, we will use graph model
2305 //to calculate the cost mapping.
2306 BaseAdapterType inputType_ = input_adapter_->adapterType();
2307 if (inputType_ == MatrixAdapterType ||
2308 inputType_ == GraphAdapterType ||
2309 inputType_ == MeshAdapterType)
2310 {
2311 graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2312 baseInputAdapter_, envConst_, ia_comm,
2313 graphFlags_));
2314 }
2315
2316 if (!machine_->hasMachineCoordinates()) {
2317 throw std::runtime_error("Existing machine does not provide "
2318 "coordinates for coordinate task mapping");
2319 }
2320
2321 //if mapping type is 0 then it is coordinate mapping
2322 int procDim = machine_->getMachineDim();
2323 this->nprocs = machine_->getNumRanks();
2324
2325 //get processor coordinates.
2326 pcoord_t **procCoordinates = NULL;
2327 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2328 throw std::runtime_error("Existing machine does not implement "
2329 "getAllMachineCoordinatesView");
2330 }
2331
2332 //get the machine extent.
2333 //if we have the machine extent and
2334 //the machine has wrap-around links, we would like to shift the
2335 //coordinates, so that the largest gap falls on the wrap-around link.
2336 std::vector<int> machine_extent_vec(procDim);
2337 //std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2338 int *machine_extent = &(machine_extent_vec[0]);
2339 bool *machine_extent_wrap_around = new bool[procDim];
2340 for (int i = 0; i < procDim; ++i)
2341 machine_extent_wrap_around[i] = false;
2342
2343 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2344
2345 // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2346 // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2347 // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2348 // MD: Yes, I ADDED BELOW:
2349 if (machine_->getMachineExtent(machine_extent) &&
2350 haveWrapArounds) {
2351
2352 procCoordinates =
2353 this->shiftMachineCoordinates(
2354 procDim,
2355 machine_extent,
2356 machine_extent_wrap_around,
2357 this->nprocs,
2358 procCoordinates);
2359 }
2360
2361 //get the tasks information, such as coordinate dimension,
2362 //number of parts.
2363 int coordDim = coordinateModel_->getCoordinateDim();
2364
2365// int coordDim = machine_->getMachineDim();
2366
2367 this->ntasks = soln_->getActualGlobalNumberOfParts();
2368 if (part_t(soln_->getTargetGlobalNumberOfParts()) > this->ntasks) {
2369 this->ntasks = soln_->getTargetGlobalNumberOfParts();
2370 }
2371 this->solution_parts = soln_->getPartListView();
2372
2373 //we need to calculate the center of parts.
2374 tcoord_t **partCenters = new tcoord_t*[coordDim];
2375 for (int i = 0; i < coordDim; ++i) {
2376 partCenters[i] = new tcoord_t[this->ntasks];
2377 }
2378
2379 typedef typename Adapter::scalar_t t_scalar_t;
2380
2381
2382 envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2383
2384 //get centers for the parts.
2385 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2386 envConst.getRawPtr(),
2387 ia_comm.getRawPtr(),
2388 coordinateModel_.getRawPtr(),
2389 this->solution_parts,
2390// soln_->getPartListView();
2391// this->soln.getRawPtr(),
2392 coordDim,
2393 ntasks,
2394 partCenters);
2395
2396 envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2397
2398 //create the part graph
2399 if (graph_model_.getRawPtr() != NULL) {
2400 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2401 envConst.getRawPtr(),
2402 ia_comm.getRawPtr(),
2403 graph_model_.getRawPtr(),
2404 this->ntasks,
2405 this->solution_parts,
2406// soln_->getPartListView(),
2407// this->soln.getRawPtr(),
2408 task_communication_xadj,
2409 task_communication_adj,
2410 task_communication_edge_weight
2411 );
2412 }
2413
2414 //create coordinate communication model.
2415 this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2416 pcoord_t, tcoord_t, part_t, node_t>(
2417 procDim,
2418 procCoordinates,
2419 coordDim,
2420 partCenters,
2421 this->nprocs,
2422 this->ntasks,
2423 machine_extent,
2424 machine_extent_wrap_around,
2425 machine_.getRawPtr());
2426
2427 int myRank = comm_->getRank();
2428 this->proc_task_comm->num_ranks_per_node = num_ranks_per_node ;
2429 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2430
2431 envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2432 this->doMapping(myRank, comm_);
2433 envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2434
2435 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2436
2437/*
2438 soln_->getCommunicationGraph(task_communication_xadj,
2439 task_communication_adj);
2440*/
2441
2442 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2443 #ifdef gnuPlot1
2444 if (comm_->getRank() == 0) {
2445
2446 part_t taskCommCount = task_communication_xadj.size();
2447 std::cout << " TotalComm:"
2448 << task_communication_xadj[taskCommCount] << std::endl;
2449 part_t maxN = task_communication_xadj[0];
2450 for (part_t i = 1; i <= taskCommCount; ++i) {
2451 part_t nc =
2452 task_communication_xadj[i] - task_communication_xadj[i - 1];
2453 if (maxN < nc)
2454 maxN = nc;
2455 }
2456 std::cout << " maxNeighbor:" << maxN << std::endl;
2457 }
2458
2459 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2460 #endif
2461
2462 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2463
2464 if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2465 task_communication_adj.getRawPtr()) {
2466 this->proc_task_comm->calculateCommunicationCost(
2467 task_to_proc.getRawPtr(),
2468 task_communication_xadj.getRawPtr(),
2469 task_communication_adj.getRawPtr(),
2470 task_communication_edge_weight.getRawPtr()
2471 );
2472 }
2473
2474// std::cout << "me: " << comm_->getRank() << " cost:"
2475// << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2476
2477 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2478
2479 //processors are divided into groups of size procDim! * coordDim!
2480 //each processor in the group obtains a mapping with a different
2481 //rotation, and the best one is broadcast to all processors.
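// For example, with procDim = 3 and coordDim = 3 each group spans
// 3! * 3! = 36 ranks, so up to 36 axis-permutation variants of the
// mapping are evaluated and the cheapest one wins.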
2482 this->getBestMapping();
2483 this->create_local_task_to_rank(
2484 coordinateModel_->getLocalNumCoordinates(),
2485 this->solution_parts,
2486 this->task_to_proc);
2487/*
2488 {
2489 if (task_communication_xadj.getRawPtr() &&
2490 task_communication_adj.getRawPtr())
2491 this->proc_task_comm->calculateCommunicationCost(
2492 task_to_proc.getRawPtr(),
2493 task_communication_xadj.getRawPtr(),
2494 task_communication_adj.getRawPtr(),
2495 task_communication_edge_weight.getRawPtr()
2496 );
2497 std::cout << "me: " << comm_->getRank() << " cost:"
2498 << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2499 }
2500*/
2501
2502
2503 #ifdef gnuPlot
2504 this->writeMapping2(comm_->getRank());
2505 #endif
2506
2507 delete [] machine_extent_wrap_around;
2508
2509 if (machine_->getMachineExtent(machine_extent) &&
2510 haveWrapArounds) {
2511 for (int i = 0; i < procDim; ++i) {
2512 delete [] procCoordinates[i];
2513 }
2514 delete [] procCoordinates;
2515 }
2516
2517 for (int i = 0; i < coordDim; ++i) {
2518 delete [] partCenters[i];
2519 }
2520 delete [] partCenters;
2521
2522
2523 }
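/* A hypothetical usage sketch for this constructor; the adapter, machine,
 * solution and environment objects (comm, machine, adapter, soln, env)
 * and adapter_t are placeholders, not defined in this file:
 *
 * typedef Zoltan2::CoordinateTaskMapper<adapter_t, part_t> mapper_t;
 * Teuchos::RCP<mapper_t> mapper(
 * new mapper_t(comm, machine, adapter, soln, env));
 * part_t owner = mapper->getAssignedProcForTask(taskId);
 */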
2524
2525
2542 CoordinateTaskMapper(
2543 const Teuchos::RCP <const Teuchos::Comm<int> > comm_,
2544 const Teuchos::RCP <const MachineRepresentation<pcoord_t,part_t> >
2545 machine_,
2546 const Teuchos::RCP <const Adapter> input_adapter_,
2547 const part_t num_parts_,
2548 const part_t *result_parts,
2549 const Teuchos::RCP <const Environment> envConst,
2550 bool is_input_adapter_distributed = true,
2551 int num_ranks_per_node = 1,
2552 bool divide_to_prime_first = false,
2553 bool reduce_best_mapping = true):
2554 PartitionMapping<Adapter>(comm_, machine_, input_adapter_,
2555 num_parts_, result_parts, envConst),
2556 proc_to_task_xadj(0),
2557 proc_to_task_adj(0),
2558 task_to_proc(0),
2559 isOwnerofModel(true),
2560 proc_task_comm(0),
2561 task_communication_xadj(0),
2562 task_communication_adj(0),
2563 task_communication_edge_weight(0) {
2564
2565 using namespace Teuchos;
2566 typedef typename Adapter::base_adapter_t ctm_base_adapter_t;
2567
2568 RCP<Zoltan2::GraphModel<ctm_base_adapter_t> > graph_model_;
2569 RCP<Zoltan2::CoordinateModel<ctm_base_adapter_t> > coordinateModel_ ;
2570
2571 RCP<const Teuchos::Comm<int> > rcp_comm = comm_;
2572 RCP<const Teuchos::Comm<int> > ia_comm = rcp_comm;
2573 if (!is_input_adapter_distributed) {
2574 ia_comm = Teuchos::createSerialComm<int>();
2575 }
2576 RCP<const Environment> envConst_ = envConst;
2577
2578 RCP<const ctm_base_adapter_t> baseInputAdapter_(
2579 rcp(dynamic_cast<const ctm_base_adapter_t *>(
2580 input_adapter_.getRawPtr()), false));
2581
2582 modelFlag_t coordFlags_, graphFlags_;
2583
2584 //create coordinate model
2585 //since this is coordinate task mapper,
2586 //the adapter has to have the coordinates
2587 coordinateModel_ = rcp(new CoordinateModel<ctm_base_adapter_t>(
2588 baseInputAdapter_, envConst_, ia_comm, coordFlags_));
2589
2590 //if the adapter has also graph model, we will use graph model
2591 //to calculate the cost mapping.
2592 BaseAdapterType inputType_ = input_adapter_->adapterType();
2593 if (inputType_ == MatrixAdapterType ||
2594 inputType_ == GraphAdapterType ||
2595 inputType_ == MeshAdapterType)
2596 {
2597 graph_model_ = rcp(new GraphModel<ctm_base_adapter_t>(
2598 baseInputAdapter_, envConst_, ia_comm,
2599 graphFlags_));
2600 }
2601
2602 if (!machine_->hasMachineCoordinates()) {
2603 throw std::runtime_error("Existing machine does not provide "
2604 "coordinates for coordinate task mapping.");
2605 }
2606
2607 //if mapping type is 0 then it is coordinate mapping
2608 int procDim = machine_->getMachineDim();
2609 this->nprocs = machine_->getNumRanks();
2610
2611 //get processor coordinates.
2612 pcoord_t **procCoordinates = NULL;
2613 if (!machine_->getAllMachineCoordinatesView(procCoordinates)) {
2614 throw std::runtime_error("Existing machine does not implement "
2615 "getAllMachineCoordinatesView");
2616 }
2617
2618 //get the machine extent.
2619 //if we have the machine extent and
2620 //the machine has wrap-around links, we would like to shift the
2621 //coordinates,
2622 //so that the largest gap falls on the wrap-around link.
2623 std::vector<int> machine_extent_vec(procDim);
2624// std::vector<bool> machine_extent_wrap_around_vec(procDim, 0);
2625 int *machine_extent = &(machine_extent_vec[0]);
2626 bool *machine_extent_wrap_around = new bool[procDim];
2627 bool haveWrapArounds = machine_->getMachineExtentWrapArounds(machine_extent_wrap_around);
2628
2629 // KDDKDD ASK MEHMET: SHOULD WE GET AND USE machine_dimension HERE IF IT
2630 // KDDKDD ASK MEHMET: IS PROVIDED BY THE MACHINE REPRESENTATION?
2631 // KDDKDD ASK MEHMET: IF NOT HERE, THEN WHERE?
2632 // MD: Yes, I ADDED BELOW:
2633 if (machine_->getMachineExtent(machine_extent) &&
2634 haveWrapArounds) {
2635 procCoordinates =
2636 this->shiftMachineCoordinates(
2637 procDim,
2638 machine_extent,
2639 machine_extent_wrap_around,
2640 this->nprocs,
2641 procCoordinates);
2642 }
2643
2644 //get the tasks information, such as coordinate dimension,
2645 //number of parts.
2646 int coordDim = coordinateModel_->getCoordinateDim();
2647
2648// int coordDim = machine_->getMachineDim();
2649
2650
2651 this->ntasks = num_parts_;
2652 this->solution_parts = result_parts;
2653
2654 //we need to calculate the center of parts.
2655 tcoord_t **partCenters = new tcoord_t*[coordDim];
2656 for (int i = 0; i < coordDim; ++i) {
2657 partCenters[i] = new tcoord_t[this->ntasks];
2658 }
2659
2660 typedef typename Adapter::scalar_t t_scalar_t;
2661
2662
2663 envConst->timerStart(MACRO_TIMERS, "Mapping - Solution Center");
2664
2665 //get centers for the parts.
2666 getSolutionCenterCoordinates<Adapter, t_scalar_t,part_t>(
2667 envConst.getRawPtr(),
2668 ia_comm.getRawPtr(),
2669 coordinateModel_.getRawPtr(),
2670 this->solution_parts,
2671// soln_->getPartListView();
2672// this->soln.getRawPtr(),
2673 coordDim,
2674 ntasks,
2675 partCenters);
2676
2677 envConst->timerStop(MACRO_TIMERS, "Mapping - Solution Center");
2678
2679 envConst->timerStart(MACRO_TIMERS, "GRAPHCREATE");
2680 //create the part graph
2681 if (graph_model_.getRawPtr() != NULL) {
2682 getCoarsenedPartGraph<Adapter, t_scalar_t, part_t>(
2683 envConst.getRawPtr(),
2684 ia_comm.getRawPtr(),
2685 graph_model_.getRawPtr(),
2686 this->ntasks,
2687 this->solution_parts,
2688// soln_->getPartListView(),
2689// this->soln.getRawPtr(),
2690 task_communication_xadj,
2691 task_communication_adj,
2692 task_communication_edge_weight
2693 );
2694 }
2695 envConst->timerStop(MACRO_TIMERS, "GRAPHCREATE");
2696
2697 envConst->timerStart(MACRO_TIMERS,
2698 "CoordinateCommunicationModel Create");
2699 //create coordinate communication model.
2700 this->proc_task_comm = new Zoltan2::CoordinateCommunicationModel<
2701 pcoord_t, tcoord_t, part_t, node_t>(
2702 procDim,
2703 procCoordinates,
2704 coordDim,
2705 partCenters,
2706 this->nprocs,
2707 this->ntasks,
2708 machine_extent,
2709 machine_extent_wrap_around,
2710 machine_.getRawPtr());
2711
2712 envConst->timerStop(MACRO_TIMERS,
2713 "CoordinateCommunicationModel Create");
2714
2715
2716 this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2717 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2718
2719 int myRank = comm_->getRank();
2720
2721
2722 envConst->timerStart(MACRO_TIMERS, "Mapping - Processor Task map");
2723 this->doMapping(myRank, comm_);
2724 envConst->timerStop(MACRO_TIMERS, "Mapping - Processor Task map");
2725
2726
2727 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Graph");
2728
2729/*
2730 soln_->getCommunicationGraph(task_communication_xadj,
2731 task_communication_adj);
2732*/
2733
2734 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Graph");
2735 #ifdef gnuPlot1
2736 if (comm_->getRank() == 0) {
2737
2738 part_t taskCommCount = task_communication_xadj.size();
2739 std::cout << " TotalComm:"
2740 << task_communication_xadj[taskCommCount] << std::endl;
2741 part_t maxN = task_communication_xadj[0];
2742 for (part_t i = 1; i <= taskCommCount; ++i) {
2743 part_t nc =
2744 task_communication_xadj[i] - task_communication_xadj[i - 1];
2745 if (maxN < nc)
2746 maxN = nc;
2747 }
2748 std::cout << " maxNeighbor:" << maxN << std::endl;
2749 }
2750
2751 this->writeGnuPlot(comm_, soln_, coordDim, partCenters);
2752 #endif
2753
2754 envConst->timerStart(MACRO_TIMERS, "Mapping - Communication Cost");
2755
2756 if (reduce_best_mapping && task_communication_xadj.getRawPtr() &&
2757 task_communication_adj.getRawPtr()) {
2758 this->proc_task_comm->calculateCommunicationCost(
2759 task_to_proc.getRawPtr(),
2760 task_communication_xadj.getRawPtr(),
2761 task_communication_adj.getRawPtr(),
2762 task_communication_edge_weight.getRawPtr()
2763 );
2764 }
2765
2766// std::cout << "me: " << comm_->getRank() << " cost:"
2767// << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2768
2769 envConst->timerStop(MACRO_TIMERS, "Mapping - Communication Cost");
2770
2771 //processors are divided into groups of size procDim! * coordDim!
2772 //each processor in the group obtains a mapping with a different
2773 //rotation, and the best one is broadcast to all processors.
2774 this->getBestMapping();
2775
2776 this->create_local_task_to_rank(
2777 coordinateModel_->getLocalNumCoordinates(),
2778 this->solution_parts,
2779 this->task_to_proc);
2780/*
2781 {
2782 if (task_communication_xadj.getRawPtr() &&
2783 task_communication_adj.getRawPtr())
2784 this->proc_task_comm->calculateCommunicationCost(
2785 task_to_proc.getRawPtr(),
2786 task_communication_xadj.getRawPtr(),
2787 task_communication_adj.getRawPtr(),
2788 task_communication_edge_weight.getRawPtr()
2789 );
2790 std::cout << "me: " << comm_->getRank() << " cost:"
2791 << this->proc_task_comm->getCommunicationCostMetric() << std::endl;
2792 }
2793*/
2794
2795
2796
2797 #ifdef gnuPlot
2798 this->writeMapping2(comm_->getRank());
2799 #endif
2800
2801 delete [] machine_extent_wrap_around;
2802
2803 if (machine_->getMachineExtent(machine_extent) &&
2804 haveWrapArounds) {
2805 for (int i = 0; i < procDim; ++i) {
2806 delete [] procCoordinates[i];
2807 }
2808 delete [] procCoordinates;
2809 }
2810
2811 for (int i = 0; i < coordDim; ++i) {
2812 delete [] partCenters[i];
2813 }
2814 delete [] partCenters;
2815 }
2816
2882 CoordinateTaskMapper(
2883 const Environment *env_const_,
2884 const Teuchos::Comm<int> *problemComm,
2885 int proc_dim,
2886 int num_processors,
2887 pcoord_t **machine_coords,
2888 int task_dim,
2889 part_t num_tasks,
2890 tcoord_t **task_coords,
2891 ArrayRCP<part_t>task_comm_xadj,
2892 ArrayRCP<part_t>task_comm_adj,
2893 pcoord_t *task_communication_edge_weight_,
2894 int recursion_depth,
2895 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
2896 const part_t *machine_dimensions,
2897 int num_ranks_per_node = 1,
2898 bool divide_to_prime_first = false,
2899 bool reduce_best_mapping = true):
2900 PartitionMapping<Adapter>(
2901 Teuchos::rcpFromRef<const Teuchos::Comm<int> >(*problemComm),
2902 Teuchos::rcpFromRef<const Environment>(*env_const_)),
2903 proc_to_task_xadj(0),
2904 proc_to_task_adj(0),
2905 task_to_proc(0),
2906 isOwnerofModel(true),
2907 proc_task_comm(0),
2908 task_communication_xadj(task_comm_xadj),
2909 task_communication_adj(task_comm_adj) {
2910
2911 //if mapping type is 0 then it is coordinate mapping
2912 pcoord_t ** virtual_machine_coordinates = machine_coords;
2913 bool *wrap_arounds = new bool [proc_dim];
2914 for (int i = 0; i < proc_dim; ++i) wrap_arounds[i] = true;
2915
2916 if (machine_dimensions) {
2917 virtual_machine_coordinates =
2918 this->shiftMachineCoordinates(
2919 proc_dim,
2920 machine_dimensions,
2921 wrap_arounds,
2922 num_processors,
2923 machine_coords);
2924 }
2925
2926 this->nprocs = num_processors;
2927
2928 int coordDim = task_dim;
2929 this->ntasks = num_tasks;
2930
2931 //alloc memory for part centers.
2932 tcoord_t **partCenters = task_coords;
2933
2934 //create coordinate communication model.
2935 this->proc_task_comm =
2936 new Zoltan2::CoordinateCommunicationModel<pcoord_t, tcoord_t, part_t, node_t>(
2937 proc_dim,
2938 virtual_machine_coordinates,
2939 coordDim,
2940 partCenters,
2941 this->nprocs,
2942 this->ntasks, NULL, NULL
2943 );
2944
2945 this->proc_task_comm->num_ranks_per_node = num_ranks_per_node;
2946 this->proc_task_comm->divide_to_prime_first = divide_to_prime_first;
2947
2948 this->proc_task_comm->setPartArraySize(recursion_depth);
2949 this->proc_task_comm->setPartArray(part_no_array);
2950
2951 int myRank = problemComm->getRank();
2952
2953 this->doMapping(myRank, this->comm);
2954#ifdef gnuPlot
2955 this->writeMapping2(myRank);
2956#endif
2957
2958 // MDM added this edge case - for example, if NX = 1, NY = 1, NZ = 1,
2959 // the communication arrays are empty, so skip the cost calculation;
2960 // that case now passes as it did on the original develop branch.
2960 if (reduce_best_mapping && task_communication_xadj.size() &&
2961 task_communication_adj.size()) {
2962 this->proc_task_comm->calculateCommunicationCost(
2963 task_to_proc.getRawPtr(),
2964 task_communication_xadj.getRawPtr(),
2965 task_communication_adj.getRawPtr(),
2966 task_communication_edge_weight_
2967 );
2968
2969
2970 this->getBestMapping();
2971
2972/*
2973 if (myRank == 0) {
2974 this->proc_task_comm->calculateCommunicationCost(
2975 task_to_proc.getRawPtr(),
2976 task_communication_xadj.getRawPtr(),
2977 task_communication_adj.getRawPtr(),
2978 task_communication_edge_weight_
2979 );
2980 cout << "me: " << problemComm->getRank() << " cost:"
2981 << this->proc_task_comm->getCommunicationCostMetric() << endl;
2982 }
2983*/
2984
2985 }
2986
2987 delete [] wrap_arounds;
2988
2989 if (machine_dimensions) {
2990 for (int i = 0; i < proc_dim; ++i) {
2991 delete [] virtual_machine_coordinates[i];
2992 }
2993 delete [] virtual_machine_coordinates;
2994 }
2995#ifdef gnuPlot
2996 if (problemComm->getRank() == 0)
2997 this->writeMapping2(-1);
2998#endif
2999 }
3000
3001
3002 /*
3003 double getCommunicationCostMetric() {
3004 return this->proc_task_comm->getCommCost();
3005 }
3006 */
3007
3010 virtual size_t getLocalNumberOfParts() const {
3011 return 0;
3012 }
3013
3024 pcoord_t **shiftMachineCoordinates(
3025 int machine_dim,
3026 const part_t *machine_dimensions,
3027 bool *machine_extent_wrap_around,
3028 part_t numProcs,
3029 pcoord_t **mCoords) {
3030
3031 pcoord_t **result_machine_coords = NULL;
3032 result_machine_coords = new pcoord_t*[machine_dim];
3033
3034 for (int i = 0; i < machine_dim; ++i) {
3035 result_machine_coords[i] = new pcoord_t [numProcs];
3036 }
3037
3038 for (int i = 0; i < machine_dim; ++i) {
3039 part_t numMachinesAlongDim = machine_dimensions[i];
3040
3041 part_t *machineCounts = new part_t[numMachinesAlongDim];
3042 memset(machineCounts, 0, sizeof(part_t) * numMachinesAlongDim);
3043
3044 int *filledCoordinates = new int[numMachinesAlongDim];
3045
3046 pcoord_t *coords = mCoords[i];
3047
3048 for (part_t j = 0; j < numProcs; ++j) {
3049 part_t mc = (part_t) coords[j];
3050 ++machineCounts[mc];
3051 }
3052
3053 part_t filledCoordinateCount = 0;
3054 for (part_t j = 0; j < numMachinesAlongDim; ++j) {
3055 if (machineCounts[j] > 0) {
3056 filledCoordinates[filledCoordinateCount++] = j;
3057 }
3058 }
3059
3060 part_t firstProcCoord = filledCoordinates[0];
3061 part_t firstProcCount = machineCounts[firstProcCoord];
3062
3063 part_t lastProcCoord = filledCoordinates[filledCoordinateCount - 1];
3064 part_t lastProcCount = machineCounts[lastProcCoord];
3065
3066 part_t firstLastGap =
3067 numMachinesAlongDim - lastProcCoord + firstProcCoord;
3068 part_t firstLastGapProc = lastProcCount + firstProcCount;
3069
3070 part_t leftSideProcCoord = firstProcCoord;
3071 part_t leftSideProcCount = firstProcCount;
3072 part_t biggestGap = 0;
3073 part_t biggestGapProc = numProcs;
3074
3075 part_t shiftBorderCoordinate = -1;
3076 for (part_t j = 1; j < filledCoordinateCount; ++j) {
3077 part_t rightSideProcCoord= filledCoordinates[j];
3078 part_t rightSideProcCount = machineCounts[rightSideProcCoord];
3079
3080 part_t gap = rightSideProcCoord - leftSideProcCoord;
3081 part_t gapProc = rightSideProcCount + leftSideProcCount;
3082
3083 // Pick the largest gap in this dimension. Use the smaller number of
3084 // processes on either side of the largest gap to break ties. An easy
3085 // addition to this would be to weight the gap by the number of
3086 // processes.
3087 if (gap > biggestGap ||
3088 (gap == biggestGap && biggestGapProc > gapProc)) {
3089 shiftBorderCoordinate = rightSideProcCoord;
3090 biggestGapProc = gapProc;
3091 biggestGap = gap;
3092 }
3093 leftSideProcCoord = rightSideProcCoord;
3094 leftSideProcCount = rightSideProcCount;
3095 }
3096
3097
3098 if (!(biggestGap > firstLastGap ||
3099 (biggestGap == firstLastGap &&
3100 biggestGapProc < firstLastGapProc))) {
3101 shiftBorderCoordinate = -1;
3102 }
3103
3104 for (part_t j = 0; j < numProcs; ++j) {
3105
3106 if (machine_extent_wrap_around[i] &&
3107 coords[j] < shiftBorderCoordinate) {
3108 result_machine_coords[i][j] = coords[j] + numMachinesAlongDim;
3109
3110 }
3111 else {
3112 result_machine_coords[i][j] = coords[j];
3113 }
3114 }
3115 delete [] machineCounts;
3116 delete [] filledCoordinates;
3117 }
3118
3119 return result_machine_coords;
3120
3121 }
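// Worked example for one dimension: with numMachinesAlongDim = 10 and
// occupied coordinates {0, 1, 8, 9}, the largest interior gap is 7
// (between 1 and 8) and the first-last wrap gap is 10 - 9 + 0 = 1, so
// shiftBorderCoordinate = 8. With wrap-around enabled, coordinates below
// 8 are shifted up by the extent, giving {10, 11, 8, 9}: the occupied
// region becomes the contiguous block 8..11 and the largest gap now sits
// on the wrap-around link.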
3122
3130 virtual void getProcsForPart(part_t taskId, part_t &numProcs,
3131 part_t *&procs) const {
3132 numProcs = 1;
3133 procs = this->task_to_proc.getRawPtr() + taskId;
3134 }
3135
3140 part_t getAssignedProcForTask(part_t taskId) {
3141 return this->task_to_proc[taskId];
3142 }
3143
3151 virtual void getPartsForProc(int procId, part_t &numParts,
3152 part_t *&parts) const {
3153
3154 part_t task_begin = this->proc_to_task_xadj[procId];
3155 part_t taskend = this->proc_to_task_xadj[procId + 1];
3156
3157 parts = this->proc_to_task_adj.getRawPtr() + task_begin;
3158 numParts = taskend - task_begin;
3159 }
3160
3161 ArrayView<part_t> getAssignedTasksForProc(part_t procId) {
3162 part_t task_begin = this->proc_to_task_xadj[procId];
3163 part_t taskend = this->proc_to_task_xadj[procId + 1];
3164
3165/*
3166 std::cout << "part_t:" << procId << " taskCount:"
3167 << taskend - task_begin << std::endl;
3168
3169 for (part_t i = task_begin; i < taskend; ++i) {
3170 std::cout << "part_t:" << procId << " task:"
3171 << proc_to_task_adj[i] << endl;
3172 }
3173*/
3174 if (taskend - task_begin > 0) {
3175 ArrayView <part_t> assignedParts(
3176 this->proc_to_task_adj.getRawPtr() + task_begin,
3177 taskend - task_begin);
3178
3179 return assignedParts;
3180 }
3181 else {
3182 ArrayView <part_t> assignedParts;
3183
3184 return assignedParts;
3185 }
3186 }
3187
3188};
3189
3268template <typename part_t, typename pcoord_t, typename tcoord_t>
3269inline void coordinateTaskMapperInterface(
3270 RCP<const Teuchos::Comm<int> > problemComm,
3271 int proc_dim,
3272 int num_processors,
3273 pcoord_t **machine_coords,
3274 int task_dim,
3275 part_t num_tasks,
3276 tcoord_t **task_coords,
3277 part_t *task_comm_xadj,
3278 part_t *task_comm_adj,
3279 // float-like, same size as task_communication_adj_; weight of the
3280 // corresponding edge.
3281 pcoord_t *task_communication_edge_weight_,
3282 part_t *proc_to_task_xadj, /*output*/
3283 part_t *proc_to_task_adj, /*output*/
3284 int recursion_depth,
3285 Kokkos::View<part_t *, Kokkos::HostSpace> part_no_array,
3286 const part_t *machine_dimensions,
3287 int num_ranks_per_node = 1,
3288 bool divide_to_prime_first = false) {
3289
3290 const Environment *envConst_ = new Environment(problemComm);
3291
3292 // mfh 03 Mar 2015: It's OK to omit the Node template
3293 // parameter in Tpetra, if you're just going to use the
3294 // default Node.
3295 typedef Tpetra::MultiVector<tcoord_t, part_t, part_t> tMVector_t;
3296
3297 Teuchos::ArrayRCP<part_t> task_communication_xadj(
3298 task_comm_xadj, 0, num_tasks + 1, false);
3299
3300 Teuchos::ArrayRCP<part_t> task_communication_adj;
3301 if (task_comm_xadj) {
3302 Teuchos::ArrayRCP<part_t> tmp_task_communication_adj(
3303 task_comm_adj, 0, task_comm_xadj[num_tasks], false);
3304 task_communication_adj = tmp_task_communication_adj;
3305 }
3306
3307
3308 CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t> *ctm =
3309 new CoordinateTaskMapper<XpetraMultiVectorAdapter<tMVector_t>, part_t>(
3310 envConst_,
3311 problemComm.getRawPtr(),
3312 proc_dim,
3313 num_processors,
3314 machine_coords,
3315// machine_coords_,
3316
3317 task_dim,
3318 num_tasks,
3319 task_coords,
3320
3321 task_communication_xadj,
3322 task_communication_adj,
3323 task_communication_edge_weight_,
3324 recursion_depth,
3325 part_no_array,
3326 machine_dimensions,
3327 num_ranks_per_node,
3328 divide_to_prime_first);
3329
3330
3331 part_t* proc_to_task_xadj_;
3332 part_t* proc_to_task_adj_;
3333
3334 ctm->getProcTask(proc_to_task_xadj_, proc_to_task_adj_);
3335
3336 for (part_t i = 0; i <= num_processors; ++i) {
3337 proc_to_task_xadj[i] = proc_to_task_xadj_[i];
3338 }
3339
3340 for (part_t i = 0; i < num_tasks; ++i) {
3341 proc_to_task_adj[i] = proc_to_task_adj_[i];
3342 }
3343
3344 delete ctm;
3345 delete envConst_;
3346}
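/* A hypothetical call sketch; the 4-task ring below is an illustration,
 * not part of this file. It maps 4 tasks onto 4 ranks of a 2-dimensional
 * machine, each task talking to its successor:
 *
 * part_t xadj[5] = {0, 1, 2, 3, 4};
 * part_t adj[4] = {1, 2, 3, 0};
 * Zoltan2::coordinateTaskMapperInterface<part_t, pcoord_t, tcoord_t>(
 * problemComm,
 * 2, 4, machine_coords, // 2-dim machine with 4 ranks
 * 2, 4, task_coords, // 2-dim coordinates for 4 tasks
 * xadj, adj,
 * NULL, // no edge weights
 * proc_to_task_xadj, // preallocated output, size 5
 * proc_to_task_adj, // preallocated output, size 4
 * recursion_depth, part_no_array,
 * NULL); // no machine dimensions given
 */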
3347
3348template <typename proc_coord_t, typename v_lno_t>
3349inline void visualize_mapping(int myRank,
3350 const int machine_coord_dim,
3351 const int num_ranks,
3352 proc_coord_t **machine_coords,
3353 const v_lno_t num_tasks,
3354 const v_lno_t *task_communication_xadj,
3355 const v_lno_t *task_communication_adj,
3356 const int *task_to_rank) {
3357
3358 std::string rankStr = Teuchos::toString<int>(myRank);
3359 std::string gnuPlots = "gnuPlot", extentionS = ".plot";
3360 std::string outF = gnuPlots + rankStr+ extentionS;
3361 std::ofstream gnuPlotCode( outF.c_str(), std::ofstream::out);
3362
3363 if (machine_coord_dim != 3) {
3364 std::cerr << "Mapping Write is only good for 3 dim" << std::endl;
3365 return;
3366 }
3367 std::string ss = "";
3368 std::string procs = "";
3369
3370 std::set<std::tuple<int, int, int, int, int, int> > my_arrows;
3371
3372 for (v_lno_t origin_task = 0; origin_task < num_tasks; ++origin_task) {
3373 int origin_rank = task_to_rank[origin_task];
3374 std::string gnuPlotArrow = "set arrow from ";
3375
3376 for (int j = 0; j < machine_coord_dim; ++j) {
3377 if (j == machine_coord_dim - 1) {
3378 gnuPlotArrow +=
3379 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3380 procs +=
3381 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank]);
3382
3383 }
3384 else {
3385 gnuPlotArrow +=
3386 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3387 + ",";
3388 procs +=
3389 Teuchos::toString<proc_coord_t>(machine_coords[j][origin_rank])
3390 + " ";
3391 }
3392 }
3393 procs += "\n";
3394
3395 gnuPlotArrow += " to ";
3396
3397
3398 for (int nind = task_communication_xadj[origin_task];
3399 nind < task_communication_xadj[origin_task + 1]; ++nind) {
3400
3401 int neighbor_task = task_communication_adj[nind];
3402
3403 bool differentnode = false;
3404 int neighbor_rank = task_to_rank[neighbor_task];
3405
3406 for (int j = 0; j < machine_coord_dim; ++j) {
3407 if (int(machine_coords[j][origin_rank]) !=
3408 int(machine_coords[j][neighbor_rank])) {
3409 differentnode = true; break;
3410 }
3411 }
3412
3413 std::tuple<int,int,int, int, int, int> foo(
3414 (int)(machine_coords[0][origin_rank]),
3415 (int)(machine_coords[1][origin_rank]),
3416 (int)(machine_coords[2][origin_rank]),
3417 (int)(machine_coords[0][neighbor_rank]),
3418 (int)(machine_coords[1][neighbor_rank]),
3419 (int)(machine_coords[2][neighbor_rank]));
3420
3421 if (differentnode && my_arrows.find(foo) == my_arrows.end()) {
3422 my_arrows.insert(foo);
3423
3424 std::string gnuPlotArrow2 = "";
3425 for (int j = 0; j < machine_coord_dim; ++j) {
3426 if (j == machine_coord_dim - 1) {
3427 gnuPlotArrow2 +=
3428 Teuchos::toString<float>(machine_coords[j][neighbor_rank]);
3429 }
3430 else {
3431 gnuPlotArrow2 +=
3432 Teuchos::toString<float>(machine_coords[j][neighbor_rank])
3433 + ",";
3434 }
3435 }
3436 ss += gnuPlotArrow + gnuPlotArrow2 + " nohead\n";
3437 }
3438 }
3439 }
3440
3441 std::ofstream procFile("procPlot.plot", std::ofstream::out);
3442 procFile << procs << "\n";
3443 procFile.close();
3444
3445 //gnuPlotCode << ss;
3446 if (machine_coord_dim == 2) {
3447 gnuPlotCode << "plot \"procPlot.plot\" with points pointsize 3\n";
3448 }
3449 else {
3450 gnuPlotCode << "splot \"procPlot.plot\" with points pointsize 3\n";
3451 }
3452
3453 gnuPlotCode << ss << "\nreplot\n pause -1\npause -1";
3454 gnuPlotCode.close();
3455}
3456
3457} // namespace Zoltan2
3458
3459#endif