Skip to content

Commit

Permalink
added tests for indexed_sum and made it much faster on gpus
Browse files Browse the repository at this point in the history
  • Loading branch information
lehner committed May 10, 2024
1 parent 268c2d8 commit b7c7481
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 9 deletions.
12 changes: 12 additions & 0 deletions lib/cgpt/lib/benchmarks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,18 @@ EXPORT(benchmarks,{
//benchmarks(16);
//benchmarks(32);

GridCartesian * UGrid = SpaceTimeGrid::makeFourDimGrid(GridDefaultLatt(), GridDefaultSimd(4,vComplexD::Nsimd()), GridDefaultMpi());

LatticeSpinColourMatrixD data1(UGrid);
LatticeSpinColourMatrixD data2(UGrid);
LatticeComplexD index(UGrid);
index = Zero();
std::vector<typename LatticeSpinColourMatrixD::scalar_object> res(192);
PVector<LatticeSpinColourMatrixD> v;
v.push_back(&data1);
v.push_back(&data2);
cgpt_rank_indexed_sum(v, index, res);

return PyLong_FromLong(0);
});

Expand Down
18 changes: 9 additions & 9 deletions lib/cgpt/lib/foundation/transform.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,23 +184,23 @@ inline void cgpt_rank_indexed_sum(const PVector<Lattice<vobj>> &Data,
auto Index_p = &Index_v[0];

VECTOR_VIEW_OPEN(Data, Data_v, AcceleratorRead);

accelerator_for(ii,index_osites_per_block,1,{

accelerator_for(_idx,index_osites_per_block * n_elem * Nbasis,1,{

uint64_t idx = _idx;
uint64_t ii = _idx % index_osites_per_block; _idx /= index_osites_per_block;
uint64_t i = _idx % n_elem; _idx /= n_elem;
uint64_t nb = _idx % Nbasis; _idx /= Nbasis;

for (long jj=0;jj<len;jj++) {
long oidx = jj*index_osites_per_block + ii;
if (oidx < index_osites) {

for (int lane=0;lane<Nsimd;lane++) {

long index = (long)((scalar_type*)&Index_p[oidx])[lane].real();

for (int nb=0;nb<Nbasis;nb++) {
for (int i=0;i<n_elem;i++) {
((scalar_type*)&lsSum_p[(nb * len + index)*index_osites_per_block + ii])[i] +=
((scalar_type*)&Data_v[nb][oidx])[i * Nsimd + lane];
}
}
((scalar_type*)&lsSum_p[(nb * len + index)*index_osites_per_block + ii])[i] +=
((scalar_type*)&Data_v[nb][oidx])[i * Nsimd + lane];
}
}
}
Expand Down
9 changes: 9 additions & 0 deletions tests/core/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,13 +161,22 @@ def assign_pos_view():
rng.cnormal(obj_list)

for dimension in range(4):
clat = g.complex(obj_list[0].grid)
clat[:] = np.ascontiguousarray(g.coordinates(clat)[:, dimension].astype(np.complex128))

tmp = g.slice(obj_list, dimension)
full_sliced = np.array([[g.util.tensor_to_value(v) for v in obj] for obj in tmp])
tmp2 = g.indexed_sum(obj_list, clat, clat.grid.gdimensions[dimension])
full_sliced2 = np.array([[g.util.tensor_to_value(v) for v in obj] for obj in tmp2])

for n, obj in enumerate(obj_list):
tmp = g.slice(obj, dimension)
tmp2 = g.indexed_sum(obj, clat, obj.grid.gdimensions[dimension])
sliced = np.array([g.util.tensor_to_value(v) for v in tmp])
sliced2 = np.array([g.util.tensor_to_value(v) for v in tmp2])
assert np.allclose(full_sliced[n], sliced, atol=0.0, rtol=1e-13)
assert np.allclose(full_sliced2[n], sliced2, atol=0.0, rtol=1e-13)
assert np.allclose(sliced, sliced2, atol=0.0, rtol=1e-12)

sliced_numpy = np.array(
[
Expand Down

0 comments on commit b7c7481

Please sign in to comment.