// multiplicator.cpp
#include "multiplicator.hpp"
#include <algorithm>
#include <mpi.h>
#include "matrix_utils.hpp"
#include "utils.hpp"
using MPI::COMM_WORLD;
using namespace std;
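
// Inner-ABC multiplication: A is split into parts = p/c column blocks with
// replication factor c; within each iteration the blocks are shifted
// p/(c*c) times so every process sees the data it needs, and the partial C
// results are then summed across each replication team (innerGatherC).
// E.g. with p = 8 processes and c = 2: parts = 4 and shifts = 2 per iteration.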
DenseMatrix Multiplicator::matmulInnerABC(int exponent, int replication) {
    c = replication;
    parts = p/c;
    part_first = 0;
    part_id = Flags::group_comm.Get_rank();
    ONE_DBG cerr << " gsize: " << Flags::group_comm.Get_size() << " parts: " << parts
                 << " part_first: " << part_first << " part_id: " << part_id << endl;
    ONE_DBG cerr << "initial sparse fragment: " << endl << A;
    int shifts = p/(c*c);
    for (int iter = 0; iter < exponent; ++iter) {
        for (int shift = 0; shift < shifts; ++shift) {
            mulInnerABC();
            ONE_DBG cerr << "after multiplication " << shift << ": " << endl << C;
            // no need for rotation after last shift, impossible to rotate with one process
            if (shifts > 1 && shift != shifts-1) {
                rotatePartA();
                ONE_DBG cerr << "after rotation " << shift << ": " << endl << A;
            }
        }
        if (c > 1 && (iter != exponent-1 || Flags::show_results)) {
            innerGatherC();
        }
        if (iter != exponent-1) {
            B = C;
            C = DenseMatrix(B.height, B.width, B.row_off, B.col_off);
        }
    }
    return C;
}

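// Column-A variant: A stays split into parts = p/c column blocks that are
// rotated around the ring within each iteration (parts - 1 rotations), and
// each process accumulates its own columns of C directly, so no reduction
// step is needed afterwards.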
DenseMatrix Multiplicator::matmulColumnA(int exponent, int replication) {
    c = replication;
    parts = p/c;
    part_first = 0;
    part_id = groupId();
    ONE_DBG cerr << " gsize: " << Flags::group_comm.Get_size() << " parts: " << parts << endl;
    ONE_DBG cerr << "initial sparse fragment: " << endl << A;
    for (int iter = 0; iter < exponent; ++iter) {
        for (int part = 0; part < parts; ++part) {
            mulColA();
            ONE_DBG cerr << "after multiplication " << part << ": " << endl << C;
            // no need for rotation after last part, impossible to rotate with one process
            if (parts > 1 && part != parts-1) {
                rotatePartA();
                ONE_DBG cerr << "after rotation " << part << ": " << endl << A;
            }
        }
        if (iter != exponent-1) {
            B = C;
            C = DenseMatrix(B.height, B.width, B.row_off, B.col_off);
        }
    }
    return C;
}

Multiplicator::Multiplicator(SparseMatrix &_A, DenseMatrix &_B, vector<int> &_nnzs)
        : p(Flags::procs), n(_B.height), g_rank(Flags::group_comm.Get_rank()),
          A(_A), B(_B), C(_B.height, _B.width, _B.row_off, _B.col_off), nnzs(_nnzs) {
    int max_nnz = *max_element(nnzs.begin(), nnzs.end());
    // Resize the communication vectors once so we don't have to worry later
    // -- they would be of this size at some point anyway
    recv_a_v.resize(max_nnz);
    recv_ij_v.resize(max_nnz + n + 1);
    // Note resize vs. reserve -- we want the send vectors empty
    send_a_v.reserve(max_nnz);
    send_ij_v.reserve(max_nnz + n + 1);
}

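// Local multiply kernel: accumulates the product of the locally held sparse
// block of A and the local dense columns of B into C. mulColA and
// mulInnerABC are currently identical; presumably they are kept as separate
// functions so the two algorithms can diverge later.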
void Multiplicator::mulColA() {
    ONE_DBG cerr << "mul part_id: " << part_id << endl;
    for (const auto &elem : A.values) {               // for each element in the sparse matrix
        for (int bcol = 0; bcol < B.width; ++bcol) {  // for each B column we have
            // No offsets needed: elem.row and elem.col are absolute indices
            // into the original matrix.
            C[elem.row][bcol] += elem.val * B[elem.col][bcol];
        }
    }
}

void Multiplicator::mulInnerABC() {
    ONE_DBG cerr << "mul part_id: " << part_id << endl;
    for (const auto &elem : A.values) {               // for each element in the sparse matrix
        for (int bcol = 0; bcol < B.width; ++bcol) {  // for each B column we have
            // No offsets needed: elem.row and elem.col are absolute indices
            // into the original matrix.
            C[elem.row][bcol] += elem.val * B[elem.col][bcol];
        }
    }
}

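// Ring rotation of the local sparse block of A: the previous Isend (if any) is
// waited on so send_A can be reused, the current block is sent to the next
// process asynchronously, and the block from the previous process is received
// into a freshly sized A.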
void Multiplicator::rotatePartA() {
    ONE_DBG cerr << "send part_id: " << part_id << " width: " << A.width
                 << " nnz: " << A.nnz() << " cur nnzs: " << nnzs;
    int next = (g_rank == parts - 1) ? 0 : g_rank + 1;
    int prev = (g_rank == 0) ? parts - 1 : g_rank - 1;
    ONE_DBG cerr << "grank: " << g_rank << " next: " << next << " prev: " << prev
                 << " gsize: " << Flags::group_comm.Get_size() << " parts: " << parts << endl;
    part_id = (part_id == part_first) ? part_first + parts - 1 : part_id - 1;
    // Prepare send & recv buffers
    if (first_isend) first_isend = false;
    else isend_req.Wait();  // ensure the previous send completed before reusing send_A
    send_A = A;
    A = SparseMatrix(send_A.height, partASize(true, part_id),
                     0, partAStart(true, part_id), nnzs[part_id]);
    ONE_DBG cerr << "recv part_id: " << part_id << " width: " << A.width
                 << " nnz: " << A.nnz() << " new nnzs: " << nnzs;
    // Send our part of A to the next process asynchronously
    isend_req = Flags::group_comm.Isend(send_A.values.data(), send_A.nnz(), SparseMatrix::ELEM_TYPE,
                                        next, ROTATE_SPARSE_BLOCK_COL);
    // Receive the next part of A from the previous process
    Flags::group_comm.Recv(A.values.data(), A.nnz(), SparseMatrix::ELEM_TYPE,
                           prev, ROTATE_SPARSE_BLOCK_COL);
}

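// Sums the partial C blocks held by the processes of one replication team with
// an in-place Allreduce, so every team member ends up with the complete block.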
void Multiplicator::innerGatherC() {
    ONE_DBG cerr << "---- before gather C ----" << endl << C;
    // Note: this could be done a bit faster by extracting the nonzero part and
    // doing an Allgatherv, but then we'd have to reorganize the data yet again
    // -- the matrix is stored in column order, while here we want to concatenate
    // row-block submatrices. Figuring out the proper indexing would probably
    // cause more bugs than it would be worth.
    Flags::team_comm.Allreduce(MPI::IN_PLACE, C.rawData(), C.elems(), MPI::DOUBLE, MPI::SUM);
    ONE_DBG cerr << "---- after gather C ----" << endl << C;
}