diff --git a/BrightData.Cuda/BrightData.Cuda.xml b/BrightData.Cuda/BrightData.Cuda.xml
index 3e746986..bbef7e59 100644
--- a/BrightData.Cuda/BrightData.Cuda.xml
+++ b/BrightData.Cuda/BrightData.Cuda.xml
@@ -55,6 +55,9 @@
+
+
+
@@ -247,7 +250,7 @@
-
+
diff --git a/BrightData.Cuda/CudaLinearAlgebraProvider.cs b/BrightData.Cuda/CudaLinearAlgebraProvider.cs
index 70916483..1d144526 100644
--- a/BrightData.Cuda/CudaLinearAlgebraProvider.cs
+++ b/BrightData.Cuda/CudaLinearAlgebraProvider.cs
@@ -9,6 +9,7 @@
using BrightData.LinearAlgebra;
using BrightData.LinearAlgebra.Segments;
using CommunityToolkit.HighPerformance.Buffers;
+using static System.Runtime.InteropServices.JavaScript.JSType; // NOTE(review): looks like an accidental IDE auto-import - no JSType member is used by the code visible in this change; confirm and drop
namespace BrightData.Cuda
{
@@ -75,6 +76,23 @@ public override INumericSegment CreateSegment(params float[] data)
return new CudaTensorSegment(deviceMemory, Provider);
}
+ ///
+ public override INumericSegment CreateSegment(IReadOnlyNumericSegment segment) // Creates a CUDA-backed segment holding a copy of the given segment's values
+ {
+ var deviceMemory = Provider.Allocate(segment.Size); // device buffer with one slot per element of the source segment
+ var temp = SpanOwner.Empty; // handed to GetSpan by ref; only disposed below if GetSpan reports it was used
+ var wasTempUsed = false;
+ try {
+ var span = segment.GetSpan(ref temp, out wasTempUsed); // presumably fills temp only when the segment cannot expose its data directly - TODO confirm
+ deviceMemory.CopyToDevice(span, 0); // upload host data; the 0 is presumably the destination offset - TODO confirm
+ }
+ finally { // runs even when the copy throws, so a used temporary is always released
+ if (wasTempUsed)
+ temp.Dispose();
+ } // NOTE(review): deviceMemory itself is not released on this path if GetSpan/CopyToDevice throws - confirm whether the provider reclaims it
+ return new CudaTensorSegment(deviceMemory, Provider);
+ }
+
internal CudaTensorSegment CreateCudaTensorSegment(IDeviceMemoryPtr ptr) => new(ptr, Provider);
///
@@ -713,14 +731,14 @@ public override IMatrix CreateMatrix(uint rows, uint columns, bool initia
}
///
- public override IMatrix FindDistances(IVector[] vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
+ public override IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
{
if (distanceMetric is not (DistanceMetric.Euclidean or DistanceMetric.Manhattan or DistanceMetric.Cosine))
throw new NotImplementedException();
var size = vectors[0].Size;
var rows = (uint)compareTo.Count;
- var columns = (uint)vectors.Length;
+ var columns = (uint)vectors.Count;
var ret = Provider.Allocate(rows * columns, null, true);
using (var vectorPtr = new PtrToDeviceMemoryList(vectors.Cast().ToArray()))
@@ -746,7 +764,7 @@ public override IMatrix FindDistances(IVector[] vectors, IReadOnly
return ones.Subtract(distance);
}
- Provider.CalculateDistances(size, columns, rows,
+ Provider.CalculateMultiDistances(size, columns, rows,
vectorPtr.DevicePointer,
compareToPtr.DevicePointer,
ret.DevicePointer,
@@ -764,6 +782,56 @@ public override IMatrix FindDistances(IVector[] vectors, IReadOnly
return matrix;
}
+ public override IVector FindDistances(IReadOnlyNumericSegment vector, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
+ { // Computes the distance from one vector to every entry of compareTo; the result has one value per compareTo entry
+ if (distanceMetric is not (DistanceMetric.Euclidean or DistanceMetric.Manhattan or DistanceMetric.Cosine))
+ throw new NotImplementedException(); // only these three metrics are handled by the kernels invoked below
+
+ var size = vector.Size;
+ var numVectors = (uint)compareTo.Count;
+ var ret = Provider.Allocate(numVectors, null, true); // kernels only add into this buffer; the 'true' flag presumably requests zeroed memory - TODO confirm
+
+ var vectorPtr = (IHaveDeviceMemory)vector; // NOTE(review): hard cast - throws InvalidCastException if the segment does not implement IHaveDeviceMemory; confirm callers always pass device-backed segments
+ using (var compareToPtr = new PtrToDeviceMemoryList(compareTo.Cast().ToArray())) { // the kernels read this argument as an array of per-vector pointers
+ if (distanceMetric == DistanceMetric.Cosine) {
+ var aa = Provider.Allocate(numVectors, null, true); // receives the sum of squares of 'vector', once per compareTo entry
+ var bb = Provider.Allocate(numVectors, null, true); // receives the sum of squares of each compareTo entry
+ Provider.CosineDistances(size, numVectors,
+ vectorPtr.Memory.DevicePointer,
+ compareToPtr.DevicePointer,
+ aa.DevicePointer,
+ ret.DevicePointer, // sits in the kernel's 'ab' position: per-entry dot products
+ bb.DevicePointer
+ );
+ using var ones = CreateVector(numVectors, _ => 1f);
+ using var vectorMagnitude = new CudaVector(CreateCudaTensorSegment(aa), this);
+ using var vectorSqrt = vectorMagnitude.Sqrt();
+ using var compareToMagnitude = new CudaVector(CreateCudaTensorSegment(bb), this);
+ using var compareToSqrt = compareToMagnitude.Sqrt();
+ using var norms = vectorSqrt.PointwiseMultiply(compareToSqrt); // product of the two vector lengths
+ using var result = new CudaVector(CreateCudaTensorSegment(ret), this);
+ using var distance = result.PointwiseDivide(norms); // dot product / lengths, i.e. cosine similarity despite the name
+ return ones.Subtract(distance); // 1 - similarity; every using-declared wrapper above is disposed as the method returns
+ }
+
+ Provider.CalculateDistances(size, numVectors, // Euclidean / Manhattan: the kernel accumulates per-element terms into ret
+ vectorPtr.Memory.DevicePointer,
+ compareToPtr.DevicePointer,
+ ret.DevicePointer,
+ distanceMetric
+ );
+ }
+
+ IVector matrix = new CudaVector(CreateCudaTensorSegment(ret), this); // NOTE(review): named 'matrix' but holds a vector of numVectors distances
+ if (distanceMetric == DistanceMetric.Euclidean) { // the kernel summed squared differences, so a square root completes the Euclidean distance
+ var sqrt = matrix.Sqrt();
+ matrix.Dispose();
+ matrix = sqrt;
+ }
+
+ return matrix;
+ }
+
///
public override void BindThread()
{
diff --git a/BrightData.Cuda/CudaProvider.cs b/BrightData.Cuda/CudaProvider.cs
index 3022146a..90454a6a 100644
--- a/BrightData.Cuda/CudaProvider.cs
+++ b/BrightData.Cuda/CudaProvider.cs
@@ -135,7 +135,9 @@ readonly CuFunction
_tensorReverseMaxPool,
_tensorReverseIm2Col,
_isFinite,
- _calculateDistance,
+ _calculateMultiDistances,
+ _calculateDistances,
+ _cosineDistances,
_roundInPlace,
_scale
;
@@ -197,66 +199,68 @@ public CudaProvider(BrightDataContext context, string? cudaKernelPath, string? c
});
_cuda.SetCurrent();
- _pointwiseMultiply = _kernel.LoadFunction("PointwiseMultiply");
- _addInPlace = _kernel.LoadFunction("AddInPlace");
- _subtractInPlace = _kernel.LoadFunction("SubtractInPlace");
- _addToEachRow = _kernel.LoadFunction("AddToEachRow");
- _addToEachColumn = _kernel.LoadFunction("AddToEachColumn");
- _multiplyByEachRow = _kernel.LoadFunction("MultiplyByEachRow");
- _multiplyByEachColumn = _kernel.LoadFunction("MultiplyByEachColumn");
- _tanh = _kernel.LoadFunction("TanH");
- _tanhDerivative = _kernel.LoadFunction("TanHDerivative");
- _sigmoid = _kernel.LoadFunction("Sigmoid");
- _sigmoidDerivative = _kernel.LoadFunction("SigmoidDerivative");
- _sumRows = _kernel.LoadFunction("SumRows");
- _relu = _kernel.LoadFunction("RELU");
- _reluDerivative = _kernel.LoadFunction("RELUDerivative");
- _memSet = _kernel.LoadFunction("MemSet");
- _memCpy = _kernel.LoadFunction("MemCpy");
- _sumColumns = _kernel.LoadFunction("SumColumns");
- _pointwiseDivide = _kernel.LoadFunction("PointwiseDivide");
- _sqrt = _kernel.LoadFunction("Sqrt");
- _findMinAndMax = _kernel.LoadFunction("FindMinAndMax");
- _sumValues = _kernel.LoadFunction("SumValues");
- _findStdDev = _kernel.LoadFunction("FindStdDev");
- _constrain = _kernel.LoadFunction("Constrain");
- _pow = _kernel.LoadFunction("Pow");
- _diagonal = _kernel.LoadFunction("Diagonal");
- _l1Regularisation = _kernel.LoadFunction("L1Regularisation");
- _leakyRelu = _kernel.LoadFunction("LeakyRELU");
- _leakyReluDerivative = _kernel.LoadFunction("LeakyRELUDerivative");
- _pointwiseDivideRows = _kernel.LoadFunction("PointwiseDivideRows");
- _pointwiseDivideColumns = _kernel.LoadFunction("PointwiseDivideColumns");
- _splitRows = _kernel.LoadFunction("SplitRows");
- _splitColumns = _kernel.LoadFunction("SplitColumns");
- _concatRows = _kernel.LoadFunction("ConcatRows");
- _concatColumns = _kernel.LoadFunction("ConcatColumns");
- _euclideanDistance = _kernel.LoadFunction("EuclideanDistance");
- _manhattanDistance = _kernel.LoadFunction("ManhattanDistance");
- _cosineDistance = _kernel.LoadFunction("CosineDistance");
- _abs = _kernel.LoadFunction("Abs");
- _normalise = _kernel.LoadFunction("Normalise");
- _softmaxVector = _kernel.LoadFunction("SoftmaxVector");
- _multiCosine = _kernel.LoadFunction("MultiCosineDistance");
- _log = _kernel.LoadFunction("Log");
- _exp = _kernel.LoadFunction("Exp");
- _vectorAddInPlace = _kernel.LoadFunction("VectorAddInPlace");
- _vectorCopyRandom = _kernel.LoadFunction("VectorCopyRandom");
- _copyToMatrixColumns = _kernel.LoadFunction("CopyToMatrixColumns");
- _copyToMatrixRows = _kernel.LoadFunction("CopyToMatrixRows");
- _tensorAddPadding = _kernel.LoadFunction("TensorAddPadding");
- _tensorRemovePadding = _kernel.LoadFunction("TensorRemovePadding");
- _tensorIm2Col = _kernel.LoadFunction("TensorIm2Col");
- _softmaxDerivative = _kernel.LoadFunction("SoftmaxDerivative");
- _reverse = _kernel.LoadFunction("Reverse");
- _rotateInPlace = _kernel.LoadFunction("RotateInPlace");
- _tensorMaxPool = _kernel.LoadFunction("TensorMaxPool");
- _tensorReverseMaxPool = _kernel.LoadFunction("TensorReverseMaxPool");
- _tensorReverseIm2Col = _kernel.LoadFunction("TensorReverseIm2Col");
- _isFinite = _kernel.LoadFunction("IsFinite");
- _calculateDistance = _kernel.LoadFunction("CalculateDistances");
- _roundInPlace = _kernel.LoadFunction("RoundInPlace");
- _scale = _kernel.LoadFunction("Scale");
+ _pointwiseMultiply = _kernel.LoadFunction("PointwiseMultiply");
+ _addInPlace = _kernel.LoadFunction("AddInPlace");
+ _subtractInPlace = _kernel.LoadFunction("SubtractInPlace");
+ _addToEachRow = _kernel.LoadFunction("AddToEachRow");
+ _addToEachColumn = _kernel.LoadFunction("AddToEachColumn");
+ _multiplyByEachRow = _kernel.LoadFunction("MultiplyByEachRow");
+ _multiplyByEachColumn = _kernel.LoadFunction("MultiplyByEachColumn");
+ _tanh = _kernel.LoadFunction("TanH");
+ _tanhDerivative = _kernel.LoadFunction("TanHDerivative");
+ _sigmoid = _kernel.LoadFunction("Sigmoid");
+ _sigmoidDerivative = _kernel.LoadFunction("SigmoidDerivative");
+ _sumRows = _kernel.LoadFunction("SumRows");
+ _relu = _kernel.LoadFunction("RELU");
+ _reluDerivative = _kernel.LoadFunction("RELUDerivative");
+ _memSet = _kernel.LoadFunction("MemSet");
+ _memCpy = _kernel.LoadFunction("MemCpy");
+ _sumColumns = _kernel.LoadFunction("SumColumns");
+ _pointwiseDivide = _kernel.LoadFunction("PointwiseDivide");
+ _sqrt = _kernel.LoadFunction("Sqrt");
+ _findMinAndMax = _kernel.LoadFunction("FindMinAndMax");
+ _sumValues = _kernel.LoadFunction("SumValues");
+ _findStdDev = _kernel.LoadFunction("FindStdDev");
+ _constrain = _kernel.LoadFunction("Constrain");
+ _pow = _kernel.LoadFunction("Pow");
+ _diagonal = _kernel.LoadFunction("Diagonal");
+ _l1Regularisation = _kernel.LoadFunction("L1Regularisation");
+ _leakyRelu = _kernel.LoadFunction("LeakyRELU");
+ _leakyReluDerivative = _kernel.LoadFunction("LeakyRELUDerivative");
+ _pointwiseDivideRows = _kernel.LoadFunction("PointwiseDivideRows");
+ _pointwiseDivideColumns = _kernel.LoadFunction("PointwiseDivideColumns");
+ _splitRows = _kernel.LoadFunction("SplitRows");
+ _splitColumns = _kernel.LoadFunction("SplitColumns");
+ _concatRows = _kernel.LoadFunction("ConcatRows");
+ _concatColumns = _kernel.LoadFunction("ConcatColumns");
+ _euclideanDistance = _kernel.LoadFunction("EuclideanDistance");
+ _manhattanDistance = _kernel.LoadFunction("ManhattanDistance");
+ _cosineDistance = _kernel.LoadFunction("CosineDistance");
+ _cosineDistances = _kernel.LoadFunction("CosineDistances");
+ _abs = _kernel.LoadFunction("Abs");
+ _normalise = _kernel.LoadFunction("Normalise");
+ _softmaxVector = _kernel.LoadFunction("SoftmaxVector");
+ _multiCosine = _kernel.LoadFunction("CosineMultiDistance");
+ _log = _kernel.LoadFunction("Log");
+ _exp = _kernel.LoadFunction("Exp");
+ _vectorAddInPlace = _kernel.LoadFunction("VectorAddInPlace");
+ _vectorCopyRandom = _kernel.LoadFunction("VectorCopyRandom");
+ _copyToMatrixColumns = _kernel.LoadFunction("CopyToMatrixColumns");
+ _copyToMatrixRows = _kernel.LoadFunction("CopyToMatrixRows");
+ _tensorAddPadding = _kernel.LoadFunction("TensorAddPadding");
+ _tensorRemovePadding = _kernel.LoadFunction("TensorRemovePadding");
+ _tensorIm2Col = _kernel.LoadFunction("TensorIm2Col");
+ _softmaxDerivative = _kernel.LoadFunction("SoftmaxDerivative");
+ _reverse = _kernel.LoadFunction("Reverse");
+ _rotateInPlace = _kernel.LoadFunction("RotateInPlace");
+ _tensorMaxPool = _kernel.LoadFunction("TensorMaxPool");
+ _tensorReverseMaxPool = _kernel.LoadFunction("TensorReverseMaxPool");
+ _tensorReverseIm2Col = _kernel.LoadFunction("TensorReverseIm2Col");
+ _isFinite = _kernel.LoadFunction("IsFinite");
+ _calculateMultiDistances = _kernel.LoadFunction("CalculateMultiDistances");
+ _calculateDistances = _kernel.LoadFunction("CalculateDistances");
+ _roundInPlace = _kernel.LoadFunction("RoundInPlace");
+ _scale = _kernel.LoadFunction("Scale");
}
///
@@ -1058,9 +1062,22 @@ internal void MultiCosine(uint size, uint columns, uint rows, CuDevicePtr vector
);
}
- internal void CalculateDistances(uint size, uint columns, uint rows, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric)
+ internal void CosineDistances(uint size, uint numVectors, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr aa, CuDevicePtr ret, CuDevicePtr bb) // Launches the CosineDistances kernel: one vector against numVectors others
{
- InvokeTensor(_calculateDistance, null, size, columns, rows,
+ InvokeMatrix(_cosineDistances, null, size, numVectors, // launch extents are size and numVectors; presumably mapped to the kernel's x and y axes - confirm in InvokeMatrix
+ vectorPtr,
+ compareToPtr,
+ aa,
+ ret, // bound to the kernel's 'ab' parameter
+ bb,
+ numVectors, // the kernel signature takes numVectors before size
+ size
+ );
+ }
+
+ internal void CalculateMultiDistances(uint size, uint columns, uint rows, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric)
+ {
+ InvokeTensor(_calculateMultiDistances, null, size, columns, rows,
vectorPtr,
compareToPtr,
ret,
@@ -1071,6 +1088,18 @@ internal void CalculateDistances(uint size, uint columns, uint rows, CuDevicePtr
);
}
+ internal void CalculateDistances(uint size, uint numVectors, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric) // Launches the single-vector CalculateDistances kernel
+ {
+ InvokeMatrix(_calculateDistances, null, size, numVectors, // launch extents are size and numVectors; presumably mapped to the kernel's x and y axes - confirm in InvokeMatrix
+ vectorPtr,
+ compareToPtr,
+ ret,
+ numVectors, // the kernel signature takes numVectors before size
+ size,
+ (uint)distanceMetric // the kernel compares this against 0 (euclidean) and 2 (manhattan); assumes the enum's numeric values match - TODO confirm
+ );
+ }
+
internal void CopyToMatrixRows(uint rows, uint columns, CudaDeviceVariable from, IDeviceMemoryPtr to, CuStream* stream = null)
{
InvokeMatrix(_copyToMatrixRows, stream, rows, columns, from.DevicePointer, to.DevicePointer, rows, columns);
diff --git a/BrightData.Cuda/CudaTensorSegment.cs b/BrightData.Cuda/CudaTensorSegment.cs
index f2d2ea58..6233711d 100644
--- a/BrightData.Cuda/CudaTensorSegment.cs
+++ b/BrightData.Cuda/CudaTensorSegment.cs
@@ -6,7 +6,7 @@
namespace BrightData.Cuda
{
- internal class CudaTensorSegment(IDeviceMemoryPtr data, CudaProvider provider) : INumericSegment
+ internal class CudaTensorSegment(IDeviceMemoryPtr data, CudaProvider provider) : INumericSegment, IHaveDeviceMemory
{
const string CudaSegmentType = "cuda";
@@ -29,6 +29,7 @@ public static bool IsCuda(IReadOnlyNumericSegment segment, [NotNullWhen(t
public int Release() => DeviceMemory.Release();
public IDeviceMemoryPtr DeviceMemory { get; } = data;
+ IDeviceMemoryPtr IHaveDeviceMemory.Memory => DeviceMemory;
public bool IsValid => DeviceMemory.IsValid;
public uint Size => DeviceMemory.Size;
public string SegmentType => CudaSegmentType;
diff --git a/BrightData.Cuda/cuda/brightwire.cu b/BrightData.Cuda/cuda/brightwire.cu
index 4f6911c3..dfb1fdba 100644
--- a/BrightData.Cuda/cuda/brightwire.cu
+++ b/BrightData.Cuda/cuda/brightwire.cu
@@ -864,7 +864,7 @@ extern "C"
}
}
- __global__ void CalculateDistances(
+ __global__ void CalculateMultiDistances(
const float** __restrict a,
const float** __restrict b,
float* __restrict c,
@@ -883,8 +883,6 @@ extern "C"
if(distanceMetric == 0) { // euclidean
float diff = aVal - bVal;
output = diff * diff;
- }else if(distanceMetric == 1) { // cosine
- output = aVal * bVal;
}else if(distanceMetric == 2) { // manhattan
output = abs(aVal - bVal);
}
@@ -895,7 +893,32 @@ extern "C"
}
}
- __global__ void MultiCosineDistance(
+ __global__ void CalculateDistances( // One vector (a) against numVectors vectors (b): adds per-element distance terms into c[j]
+ const float* __restrict a, // the single source vector, read at indices below size
+ const float** __restrict b, // numVectors pointers, each read at indices below size
+ float* __restrict c, // one accumulator per vector in b; only ever added to, so it has to start at zero for the sums to be meaningful
+ uint numVectors,
+ uint size,
+ uint distanceMetric
+ ) {
+ for (uint i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { // grid-stride loop over the element index (x axis)
+ for (uint j = blockDim.y * blockIdx.y + threadIdx.y; j < numVectors; j += blockDim.y * gridDim.y) { // grid-stride loop over the vector index (y axis)
+ float aVal = a[i];
+ float bVal = b[j][i];
+ float output = 0; // stays 0 for any metric value other than 0 or 2
+
+ if(distanceMetric == 0) { // euclidean: squared difference only, no square root is taken in this kernel
+ float diff = aVal - bVal;
+ output = diff * diff;
+ }else if(distanceMetric == 2) { // manhattan: absolute difference
+ output = abs(aVal - bVal);
+ }
+ atomicAdd(c + j, output); // every element index i contributes to the same c[j], hence the atomic add
+ }
+ }
+ }
+
+ __global__ void CosineMultiDistance(
const float** __restrict a,
const float** __restrict b,
float* __restrict aa,
@@ -919,6 +942,26 @@ extern "C"
}
}
+ __global__ void CosineDistances( // One vector (a) against numVectors vectors (b): accumulates the three sums needed for cosine similarity
+ const float* __restrict a, // the single source vector, read at indices below size
+ const float** __restrict b, // numVectors pointers, each read at indices below size
+ float* __restrict aa, // per j: running sum of a[i]*a[i]
+ float* __restrict ab, // per j: running sum of a[i]*b[j][i] (dot product)
+ float* __restrict bb, // per j: running sum of b[j][i]*b[j][i]
+ uint numVectors,
+ uint size
+ ) {
+ for (uint i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { // grid-stride loop over the element index (x axis)
+ for (uint j = blockDim.y * blockIdx.y + threadIdx.y; j < numVectors; j += blockDim.y * gridDim.y) { // grid-stride loop over the vector index (y axis)
+ float aVal = a[i];
+ float bVal = b[j][i];
+ atomicAdd(aa + j, aVal * aVal); // same value for every j: the squared length of a is stored once per compared vector
+ atomicAdd(ab + j, aVal * bVal);
+ atomicAdd(bb + j, bVal * bVal); // no division or square root here; the outputs are raw sums
+ }
+ }
+ }
+
__global__ void SumValues(const float* __restrict a, uint count, float* __restrict sum, uint ai)
{
uint tidX = threadIdx.x;
diff --git a/BrightData.Cuda/cuda/brightwire.ptx b/BrightData.Cuda/cuda/brightwire.ptx
index 42cce186..66aa3204 100644
--- a/BrightData.Cuda/cuda/brightwire.ptx
+++ b/BrightData.Cuda/cuda/brightwire.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances( // kernel: for each element i of one f32 vector a and each vector b_j of a pointer array, atomically adds a*a, a*b and b*b into three f32 arrays indexed by j
+ .param .u64 CosineDistances_param_0, // address of the f32 vector a, read as a[i]
+ .param .u64 CosineDistances_param_1, // address of an array of 64-bit pointers, one per compared f32 vector b_j
+ .param .u64 CosineDistances_param_2, // address of the f32 array that accumulates a[i]*a[i] at index j
+ .param .u64 CosineDistances_param_3, // address of the f32 array that accumulates a[i]*b_j[i] at index j
+ .param .u64 CosineDistances_param_4, // address of the f32 array that accumulates b_j[i]*b_j[i] at index j
+ .param .u32 CosineDistances_param_5, // exclusive upper bound for j (y direction)
+ .param .u32 CosineDistances_param_6 // exclusive upper bound for i (x direction)
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13; // i = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6; // i >= param_6: this thread has no element to process
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16; // j0 = ntid.y * ctaid.y + tid.y
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17; // x stride = ntid.x * nctaid.x
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18; // y stride = ntid.y * nctaid.y
+ cvta.to.global.u64 %rd1, %rd7; // %rd1..%rd5 = param_0..param_4 as global-space addresses
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2: // outer loop over i
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5; // j0 >= param_5: skip the inner loop but still advance i
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13]; // a = a[i], loaded through the non-coherent (.nc) path
+ mul.ftz.f32 %f2, %f1, %f1; // a*a, constant for the whole inner loop
+ mov.u32 %r20, %r3; // j = j0
+
+$L__BB60_4: // inner loop over j
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15]; // b_j = pointer stored at param_1[j]
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19]; // b = b_j[i]
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2; // param_2[j] += a*a (atomic)
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5; // param_3[j] += a*b (atomic)
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7; // param_4[j] += b*b (atomic)
+ add.s32 %r20, %r20, %r5; // j += y stride
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4; // repeat while j < param_5
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4; // i += x stride
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2; // repeat while i < param_6
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_50.ptx b/BrightData.Cuda/cuda/brightwire_50.ptx
index 33090aa4..ce953a43 100644
--- a/BrightData.Cuda/cuda/brightwire_50.ptx
+++ b/BrightData.Cuda/cuda/brightwire_50.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances( // kernel: for each pair (u_j from the param_0 pointer array, v_k from the param_1 pointer array) and each element i, atomically adds a distance term into out[j * param_3 + k]
+ .param .u64 CalculateMultiDistances_param_0, // address of an array of 64-bit pointers to f32 vectors u_j
+ .param .u64 CalculateMultiDistances_param_1, // address of an array of 64-bit pointers to f32 vectors v_k
+ .param .u64 CalculateMultiDistances_param_2, // address of the f32 output, indexed by j * param_3 + k
+ .param .u32 CalculateMultiDistances_param_3, // exclusive upper bound for k (z direction); also the output row width
+ .param .u32 CalculateMultiDistances_param_4, // exclusive upper bound for j (y direction)
+ .param .u32 CalculateMultiDistances_param_5, // exclusive upper bound for i (x direction)
+ .param .u32 CalculateMultiDistances_param_6 // selector: 0 -> adds (u-v)^2, 2 -> adds |u-v|, any other value adds 0.0
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8; // %rd1 = output (param_2) as a global-space address
+ cvta.to.global.u64 %rd2, %rd7; // %rd2 = param_1 pointer array
+ cvta.to.global.u64 %rd3, %rd6; // %rd3 = param_0 pointer array
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30; // i = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18; // i >= param_5: this thread has no element to process
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33; // j0 = ntid.y * ctaid.y + tid.y
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34; // x stride = ntid.x * nctaid.x
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37; // k0 = ntid.z * ctaid.z + tid.z
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38; // y stride = ntid.y * nctaid.y
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39; // z stride = ntid.z * nctaid.z
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17; // j0 >= param_4: nothing to do for this i, just advance it
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12; // selector 0: squared-difference loops
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3; // j = j0 (selector != 0)
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11; // k0 >= param_3: skip the k loop for this j
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10]; // u_j = pointer stored at param_0[j]
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13; // address of u_j[i]
+ mul.lo.s32 %r10, %r43, %r25; // output row offset = j * param_3
+ @%p5 bra $L__BB57_9; // selector 2: absolute-difference loop
+ bra.uni $L__BB57_7; // any other non-zero selector: add-zero loop
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5]; // u = u_j[i]
+ mov.u32 %r46, %r5; // k = k0
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17]; // v_k = pointer stored at param_1[k]
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21]; // v = v_k[i]
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5; // |u - v|
+ add.s32 %r40, %r46, %r10; // output index = j * param_3 + k
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6; // out[j * param_3 + k] += |u - v| (atomic)
+ add.s32 %r46, %r46, %r7; // k += z stride
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10; // repeat while k < param_3
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10; // output index = j * param_3 + k0
+ mov.u32 %r45, %r5; // k = k0
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000; // atomically adds 0.0 to out[j * param_3 + k]
+ add.s32 %r44, %r44, %r7; // output index += z stride
+ add.s32 %r45, %r45, %r7; // k += z stride
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8; // repeat while k < param_3
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6; // j += y stride
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5; // repeat while j < param_4
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3; // j = j0 (selector 0)
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16; // k0 >= param_3: skip the k loop for this j
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25]; // u_j = pointer stored at param_0[j]
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29]; // u = u_j[i]
+ mul.lo.s32 %r20, %r47, %r25; // output row offset = j * param_3
+ mov.u32 %r48, %r5; // k = k0
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31]; // v_k = pointer stored at param_1[k]
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35]; // v = v_k[i]
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9; // (u - v)^2
+ add.s32 %r41, %r48, %r20; // output index = j * param_3 + k
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10; // out[j * param_3 + k] += (u - v)^2 (atomic)
+ add.s32 %r48, %r48, %r7; // k += z stride
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15; // repeat while k < param_3
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6; // j += y stride
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13; // repeat while j < param_4
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4; // i += x stride
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2; // repeat while i < param_5
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances( // kernel: for each element i of one f32 vector a and each vector b_j of a pointer array, atomically adds a distance term into out[j]
+ .param .u64 CalculateDistances_param_0, // address of the f32 vector a, read as a[i]
+ .param .u64 CalculateDistances_param_1, // address of an array of 64-bit pointers, one per compared f32 vector b_j
+ .param .u64 CalculateDistances_param_2, // address of the f32 output, indexed by j
+ .param .u32 CalculateDistances_param_3, // exclusive upper bound for j (y direction)
+ .param .u32 CalculateDistances_param_4, // exclusive upper bound for i (x direction)
+ .param .u32 CalculateDistances_param_5 // selector: 0 -> adds (a-b)^2, 2 -> adds |a-b|, any other value adds 0.0
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9; // %rd1 = output (param_2) as a global-space address
+ cvta.to.global.u64 %rd2, %rd8; // %rd2 = param_1 pointer array
+ cvta.to.global.u64 %rd3, %rd7; // %rd3 = vector a (param_0)
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20; // i = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15; // i >= param_4: this thread has no element to process
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23; // j0 = ntid.y * ctaid.y + tid.y
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24; // x stride = ntid.x * nctaid.x
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25; // y stride = ntid.y * nctaid.y
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11; // selector 0: squared-difference loops (test done once, outside the i loop)
+
+$L__BB58_3: // outer loop over i for selector != 0
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9; // j0 >= param_3: skip the j loop but still advance i
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10; // address of a[i]
+ @%p4 bra $L__BB58_7; // selector 2: absolute-difference loop
+ bra.uni $L__BB58_5; // any other non-zero selector: add-zero loop
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5]; // a = a[i], loaded through the non-coherent (.nc) path
+ mov.u32 %r28, %r3; // j = j0
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14]; // b_j = pointer stored at param_1[j]
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18]; // b = b_j[i]
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5; // |a - b|
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6; // out[j] += |a - b| (atomic)
+ add.s32 %r28, %r28, %r5; // j += y stride
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8; // repeat while j < param_3
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3; // j = j0
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000; // atomically adds 0.0 to out[j]
+ add.s32 %r27, %r27, %r5; // j += y stride
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6; // repeat while j < param_3
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4; // i += x stride
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3; // repeat while i < param_4
+ bra.uni $L__BB58_15;
+
+$L__BB58_11: // outer loop over i for selector 0
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14; // j0 >= param_3: skip the j loop but still advance i
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22]; // a = a[i]
+ mov.u32 %r30, %r3; // j = j0
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24]; // b_j = pointer stored at param_1[j]
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28]; // b = b_j[i]
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9; // (a - b)^2
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10; // out[j] += (a - b)^2 (atomic)
+ add.s32 %r30, %r30, %r5; // j += y stride
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13; // repeat while j < param_3
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4; // i += x stride
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11; // repeat while i < param_4
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances( // kernel: for each element i of one f32 vector a and each vector b_j of a pointer array, atomically adds a*a, a*b and b*b into three f32 arrays indexed by j
+ .param .u64 CosineDistances_param_0, // address of the f32 vector a, read as a[i]
+ .param .u64 CosineDistances_param_1, // address of an array of 64-bit pointers, one per compared f32 vector b_j
+ .param .u64 CosineDistances_param_2, // address of the f32 array that accumulates a[i]*a[i] at index j
+ .param .u64 CosineDistances_param_3, // address of the f32 array that accumulates a[i]*b_j[i] at index j
+ .param .u64 CosineDistances_param_4, // address of the f32 array that accumulates b_j[i]*b_j[i] at index j
+ .param .u32 CosineDistances_param_5, // exclusive upper bound for j (y direction)
+ .param .u32 CosineDistances_param_6 // exclusive upper bound for i (x direction)
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13; // i = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6; // i >= param_6: this thread has no element to process
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16; // j0 = ntid.y * ctaid.y + tid.y
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17; // x stride = ntid.x * nctaid.x
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18; // y stride = ntid.y * nctaid.y
+ cvta.to.global.u64 %rd1, %rd7; // %rd1..%rd5 = param_0..param_4 as global-space addresses
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2: // outer loop over i
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5; // j0 >= param_5: skip the inner loop but still advance i
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13]; // a = a[i], loaded through the non-coherent (.nc) path
+ mul.ftz.f32 %f2, %f1, %f1; // a*a, constant for the whole inner loop
+ mov.u32 %r20, %r3; // j = j0
+
+$L__BB60_4: // inner loop over j
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15]; // b_j = pointer stored at param_1[j]
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19]; // b = b_j[i]
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2; // param_2[j] += a*a (atomic)
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5; // param_3[j] += a*b (atomic)
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7; // param_4[j] += b*b (atomic)
+ add.s32 %r20, %r20, %r5; // j += y stride
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4; // repeat while j < param_5
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4; // i += x stride
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2; // repeat while i < param_6
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_52.ptx b/BrightData.Cuda/cuda/brightwire_52.ptx
index 42cce186..66aa3204 100644
--- a/BrightData.Cuda/cuda/brightwire_52.ptx
+++ b/BrightData.Cuda/cuda/brightwire_52.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances( // kernel: for each pair (u_j from the param_0 pointer array, v_k from the param_1 pointer array) and each element i, atomically adds a distance term into out[j * param_3 + k]
+ .param .u64 CalculateMultiDistances_param_0, // address of an array of 64-bit pointers to f32 vectors u_j
+ .param .u64 CalculateMultiDistances_param_1, // address of an array of 64-bit pointers to f32 vectors v_k
+ .param .u64 CalculateMultiDistances_param_2, // address of the f32 output, indexed by j * param_3 + k
+ .param .u32 CalculateMultiDistances_param_3, // exclusive upper bound for k (z direction); also the output row width
+ .param .u32 CalculateMultiDistances_param_4, // exclusive upper bound for j (y direction)
+ .param .u32 CalculateMultiDistances_param_5, // exclusive upper bound for i (x direction)
+ .param .u32 CalculateMultiDistances_param_6 // selector: 0 -> adds (u-v)^2, 2 -> adds |u-v|, any other value adds 0.0
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8; // %rd1 = output (param_2) as a global-space address
+ cvta.to.global.u64 %rd2, %rd7; // %rd2 = param_1 pointer array
+ cvta.to.global.u64 %rd3, %rd6; // %rd3 = param_0 pointer array
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30; // i = ntid.x * ctaid.x + tid.x
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18; // i >= param_5: this thread has no element to process
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33; // j0 = ntid.y * ctaid.y + tid.y
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34; // x stride = ntid.x * nctaid.x
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37; // k0 = ntid.z * ctaid.z + tid.z
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38; // y stride = ntid.y * nctaid.y
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39; // z stride = ntid.z * nctaid.z
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17; // j0 >= param_4: nothing to do for this i, just advance it
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12; // selector 0: squared-difference loops
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3; // j = j0 (selector != 0)
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11; // k0 >= param_3: skip the k loop for this j
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10]; // u_j = pointer stored at param_0[j]
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13; // address of u_j[i]
+ mul.lo.s32 %r10, %r43, %r25; // output row offset = j * param_3
+ @%p5 bra $L__BB57_9; // selector 2: absolute-difference loop
+ bra.uni $L__BB57_7; // any other non-zero selector: add-zero loop
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5]; // u = u_j[i]
+ mov.u32 %r46, %r5; // k = k0
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17]; // v_k = pointer stored at param_1[k]
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21]; // v = v_k[i]
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5; // |u - v|
+ add.s32 %r40, %r46, %r10; // output index = j * param_3 + k
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6; // out[j * param_3 + k] += |u - v| (atomic)
+ add.s32 %r46, %r46, %r7; // k += z stride
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10; // repeat while k < param_3
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10; // output index = j * param_3 + k0
+ mov.u32 %r45, %r5; // k = k0
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000; // atomically adds 0.0 to out[j * param_3 + k]
+ add.s32 %r44, %r44, %r7; // output index += z stride
+ add.s32 %r45, %r45, %r7; // k += z stride
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8; // repeat while k < param_3
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6; // j += y stride
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5; // repeat while j < param_4
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3; // j = j0 (selector 0)
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16; // k0 >= param_3: skip the k loop for this j
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25]; // u_j = pointer stored at param_0[j]
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29]; // u = u_j[i]
+ mul.lo.s32 %r20, %r47, %r25; // output row offset = j * param_3
+ mov.u32 %r48, %r5; // k = k0
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31]; // v_k = pointer stored at param_1[k]
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35]; // v = v_k[i]
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9; // (u - v)^2
+ add.s32 %r41, %r48, %r20; // output index = j * param_3 + k
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10; // out[j * param_3 + k] += (u - v)^2 (atomic)
+ add.s32 %r48, %r48, %r7; // k += z stride
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15; // repeat while k < param_3
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6; // j += y stride
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13; // repeat while j < param_4
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4; // i += x stride
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2; // repeat while i < param_5
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_53.ptx b/BrightData.Cuda/cuda/brightwire_53.ptx
index bb167564..08740d7c 100644
--- a/BrightData.Cuda/cuda/brightwire_53.ptx
+++ b/BrightData.Cuda/cuda/brightwire_53.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_60.ptx b/BrightData.Cuda/cuda/brightwire_60.ptx
index 3be9b95b..6f747af4 100644
--- a/BrightData.Cuda/cuda/brightwire_60.ptx
+++ b/BrightData.Cuda/cuda/brightwire_60.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_61.ptx b/BrightData.Cuda/cuda/brightwire_61.ptx
index 2736bd0a..9e033fca 100644
--- a/BrightData.Cuda/cuda/brightwire_61.ptx
+++ b/BrightData.Cuda/cuda/brightwire_61.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_62.ptx b/BrightData.Cuda/cuda/brightwire_62.ptx
index 5a821223..2c544749 100644
--- a/BrightData.Cuda/cuda/brightwire_62.ptx
+++ b/BrightData.Cuda/cuda/brightwire_62.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_70.ptx b/BrightData.Cuda/cuda/brightwire_70.ptx
index f2a86115..ca8ad5e5 100644
--- a/BrightData.Cuda/cuda/brightwire_70.ptx
+++ b/BrightData.Cuda/cuda/brightwire_70.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_72.ptx b/BrightData.Cuda/cuda/brightwire_72.ptx
index 0e508c92..d5e55a89 100644
--- a/BrightData.Cuda/cuda/brightwire_72.ptx
+++ b/BrightData.Cuda/cuda/brightwire_72.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_75.ptx b/BrightData.Cuda/cuda/brightwire_75.ptx
index 92b2c519..d9f08c80 100644
--- a/BrightData.Cuda/cuda/brightwire_75.ptx
+++ b/BrightData.Cuda/cuda/brightwire_75.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_80.ptx b/BrightData.Cuda/cuda/brightwire_80.ptx
index 2f211aad..1ff5501c 100644
--- a/BrightData.Cuda/cuda/brightwire_80.ptx
+++ b/BrightData.Cuda/cuda/brightwire_80.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_86.ptx b/BrightData.Cuda/cuda/brightwire_86.ptx
index b9d46f59..0fb9cfae 100644
--- a/BrightData.Cuda/cuda/brightwire_86.ptx
+++ b/BrightData.Cuda/cuda/brightwire_86.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_87.ptx b/BrightData.Cuda/cuda/brightwire_87.ptx
index 74abd754..1401facf 100644
--- a/BrightData.Cuda/cuda/brightwire_87.ptx
+++ b/BrightData.Cuda/cuda/brightwire_87.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_89.ptx b/BrightData.Cuda/cuda/brightwire_89.ptx
index 63e2f638..8dff67ac 100644
--- a/BrightData.Cuda/cuda/brightwire_89.ptx
+++ b/BrightData.Cuda/cuda/brightwire_89.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/brightwire_90.ptx b/BrightData.Cuda/cuda/brightwire_90.ptx
index b0423e2c..760cebcd 100644
--- a/BrightData.Cuda/cuda/brightwire_90.ptx
+++ b/BrightData.Cuda/cuda/brightwire_90.ptx
@@ -4022,200 +4022,308 @@ $L__BB56_3:
ret;
}
- // .globl CalculateDistances
-.visible .entry CalculateDistances(
- .param .u64 CalculateDistances_param_0,
- .param .u64 CalculateDistances_param_1,
- .param .u64 CalculateDistances_param_2,
- .param .u32 CalculateDistances_param_3,
- .param .u32 CalculateDistances_param_4,
- .param .u32 CalculateDistances_param_5,
- .param .u32 CalculateDistances_param_6
+ // .globl CalculateMultiDistances
+.visible .entry CalculateMultiDistances(
+ .param .u64 CalculateMultiDistances_param_0,
+ .param .u64 CalculateMultiDistances_param_1,
+ .param .u64 CalculateMultiDistances_param_2,
+ .param .u32 CalculateMultiDistances_param_3,
+ .param .u32 CalculateMultiDistances_param_4,
+ .param .u32 CalculateMultiDistances_param_5,
+ .param .u32 CalculateMultiDistances_param_6
)
{
- .reg .pred %p<15>;
- .reg .f32 %f<15>;
- .reg .b32 %r<53>;
- .reg .b64 %rd<46>;
-
-
- ld.param.u64 %rd5, [CalculateDistances_param_0];
- ld.param.u64 %rd6, [CalculateDistances_param_1];
- ld.param.u64 %rd7, [CalculateDistances_param_2];
- ld.param.u32 %r27, [CalculateDistances_param_3];
- ld.param.u32 %r28, [CalculateDistances_param_4];
- ld.param.u32 %r29, [CalculateDistances_param_5];
- ld.param.u32 %r30, [CalculateDistances_param_6];
- cvta.to.global.u64 %rd1, %rd7;
- cvta.to.global.u64 %rd2, %rd6;
- cvta.to.global.u64 %rd3, %rd5;
- mov.u32 %r31, %ctaid.x;
+ .reg .pred %p<13>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<49>;
+ .reg .b64 %rd<38>;
+
+
+ ld.param.u64 %rd6, [CalculateMultiDistances_param_0];
+ ld.param.u64 %rd7, [CalculateMultiDistances_param_1];
+ ld.param.u64 %rd8, [CalculateMultiDistances_param_2];
+ ld.param.u32 %r25, [CalculateMultiDistances_param_3];
+ ld.param.u32 %r26, [CalculateMultiDistances_param_4];
+ ld.param.u32 %r27, [CalculateMultiDistances_param_5];
+ ld.param.u32 %r28, [CalculateMultiDistances_param_6];
+ cvta.to.global.u64 %rd1, %rd8;
+ cvta.to.global.u64 %rd2, %rd7;
+ cvta.to.global.u64 %rd3, %rd6;
+ mov.u32 %r29, %ctaid.x;
mov.u32 %r1, %ntid.x;
- mov.u32 %r32, %tid.x;
- mad.lo.s32 %r45, %r1, %r31, %r32;
- setp.ge.u32 %p1, %r45, %r29;
- @%p1 bra $L__BB57_21;
-
- mov.u32 %r33, %ntid.y;
- mov.u32 %r34, %ctaid.y;
- mov.u32 %r35, %tid.y;
- mad.lo.s32 %r3, %r33, %r34, %r35;
- mov.u32 %r36, %nctaid.x;
- mul.lo.s32 %r4, %r1, %r36;
- mov.u32 %r37, %ctaid.z;
- mov.u32 %r38, %ntid.z;
- mov.u32 %r39, %tid.z;
- mad.lo.s32 %r5, %r38, %r37, %r39;
- mov.u32 %r40, %nctaid.y;
- mul.lo.s32 %r6, %r33, %r40;
- mov.u32 %r41, %nctaid.z;
- mul.lo.s32 %r7, %r38, %r41;
+ mov.u32 %r30, %tid.x;
+ mad.lo.s32 %r42, %r1, %r29, %r30;
+ setp.ge.u32 %p1, %r42, %r27;
+ @%p1 bra $L__BB57_18;
+
+ mov.u32 %r31, %ntid.y;
+ mov.u32 %r32, %ctaid.y;
+ mov.u32 %r33, %tid.y;
+ mad.lo.s32 %r3, %r31, %r32, %r33;
+ mov.u32 %r34, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r34;
+ mov.u32 %r35, %ctaid.z;
+ mov.u32 %r36, %ntid.z;
+ mov.u32 %r37, %tid.z;
+ mad.lo.s32 %r5, %r36, %r35, %r37;
+ mov.u32 %r38, %nctaid.y;
+ mul.lo.s32 %r6, %r31, %r38;
+ mov.u32 %r39, %nctaid.z;
+ mul.lo.s32 %r7, %r36, %r39;
$L__BB57_2:
- setp.ge.u32 %p2, %r3, %r28;
- @%p2 bra $L__BB57_20;
+ setp.ge.u32 %p2, %r3, %r26;
+ @%p2 bra $L__BB57_17;
- setp.eq.s32 %p3, %r30, 0;
- cvt.u64.u32 %rd4, %r45;
- @%p3 bra $L__BB57_15;
+ setp.eq.s32 %p3, %r28, 0;
+ cvt.u64.u32 %rd4, %r42;
+ @%p3 bra $L__BB57_12;
- mov.u32 %r46, %r3;
+ mov.u32 %r43, %r3;
$L__BB57_5:
- setp.ge.u32 %p4, %r5, %r27;
- @%p4 bra $L__BB57_14;
-
- mul.wide.u32 %rd8, %r46, 8;
- add.s64 %rd9, %rd3, %rd8;
- ld.global.nc.u64 %rd10, [%rd9];
- cvta.to.global.u64 %rd11, %rd10;
- shl.b64 %rd12, %rd4, 2;
- add.s64 %rd13, %rd11, %rd12;
- ld.global.f32 %f1, [%rd13];
- mul.lo.s32 %r10, %r46, %r27;
- setp.eq.s32 %p5, %r30, 1;
- @%p5 bra $L__BB57_10;
-
- setp.ne.s32 %p6, %r30, 2;
- @%p6 bra $L__BB57_12;
+ setp.ge.u32 %p4, %r5, %r25;
+ @%p4 bra $L__BB57_11;
- add.s32 %r47, %r5, %r10;
- mov.u32 %r48, %r5;
+ setp.eq.s32 %p5, %r28, 2;
+ mul.wide.u32 %rd9, %r43, 8;
+ add.s64 %rd10, %rd3, %rd9;
+ ld.global.nc.u64 %rd11, [%rd10];
+ cvta.to.global.u64 %rd12, %rd11;
+ shl.b64 %rd13, %rd4, 2;
+ add.s64 %rd5, %rd12, %rd13;
+ mul.lo.s32 %r10, %r43, %r25;
+ @%p5 bra $L__BB57_9;
+ bra.uni $L__BB57_7;
$L__BB57_9:
- mul.wide.u32 %rd14, %r48, 8;
- add.s64 %rd15, %rd2, %rd14;
- ld.global.nc.u64 %rd16, [%rd15];
- cvta.to.global.u64 %rd17, %rd16;
- add.s64 %rd19, %rd17, %rd12;
- ld.global.f32 %f3, [%rd19];
- sub.ftz.f32 %f4, %f1, %f3;
- abs.ftz.f32 %f5, %f4;
- mul.wide.u32 %rd20, %r47, 4;
- add.s64 %rd21, %rd1, %rd20;
- atom.global.add.f32 %f6, [%rd21], %f5;
- add.s32 %r47, %r47, %r7;
- add.s32 %r48, %r48, %r7;
- setp.lt.u32 %p7, %r48, %r27;
- @%p7 bra $L__BB57_9;
- bra.uni $L__BB57_14;
+ ld.global.f32 %f1, [%rd5];
+ mov.u32 %r46, %r5;
$L__BB57_10:
- mov.u32 %r49, %r5;
+ mul.wide.u32 %rd16, %r46, 8;
+ add.s64 %rd17, %rd2, %rd16;
+ ld.global.nc.u64 %rd18, [%rd17];
+ cvta.to.global.u64 %rd19, %rd18;
+ add.s64 %rd21, %rd19, %rd13;
+ ld.global.f32 %f4, [%rd21];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ add.s32 %r40, %r46, %r10;
+ mul.wide.u32 %rd22, %r40, 4;
+ add.s64 %rd23, %rd1, %rd22;
+ atom.global.add.f32 %f7, [%rd23], %f6;
+ add.s32 %r46, %r46, %r7;
+ setp.lt.u32 %p7, %r46, %r25;
+ @%p7 bra $L__BB57_10;
+ bra.uni $L__BB57_11;
+
+$L__BB57_7:
+ add.s32 %r44, %r5, %r10;
+ mov.u32 %r45, %r5;
+
+$L__BB57_8:
+ mul.wide.u32 %rd14, %r44, 4;
+ add.s64 %rd15, %rd1, %rd14;
+ atom.global.add.f32 %f3, [%rd15], 0f00000000;
+ add.s32 %r44, %r44, %r7;
+ add.s32 %r45, %r45, %r7;
+ setp.lt.u32 %p6, %r45, %r25;
+ @%p6 bra $L__BB57_8;
$L__BB57_11:
- mul.wide.u32 %rd22, %r49, 8;
- add.s64 %rd23, %rd2, %rd22;
- ld.global.nc.u64 %rd24, [%rd23];
- cvta.to.global.u64 %rd25, %rd24;
- add.s64 %rd27, %rd25, %rd12;
- ld.global.f32 %f7, [%rd27];
- mul.ftz.f32 %f8, %f1, %f7;
- add.s32 %r42, %r49, %r10;
- mul.wide.u32 %rd28, %r42, 4;
- add.s64 %rd29, %rd1, %rd28;
- atom.global.add.f32 %f9, [%rd29], %f8;
- add.s32 %r49, %r49, %r7;
- setp.lt.u32 %p8, %r49, %r27;
- @%p8 bra $L__BB57_11;
- bra.uni $L__BB57_14;
+ add.s32 %r43, %r43, %r6;
+ setp.lt.u32 %p8, %r43, %r26;
+ @%p8 bra $L__BB57_5;
+ bra.uni $L__BB57_17;
$L__BB57_12:
- mov.u32 %r50, %r5;
+ mov.u32 %r47, %r3;
$L__BB57_13:
- add.s32 %r43, %r50, %r10;
- mul.wide.u32 %rd30, %r43, 4;
- add.s64 %rd31, %rd1, %rd30;
- atom.global.add.f32 %f10, [%rd31], 0f00000000;
- add.s32 %r50, %r50, %r7;
- setp.lt.u32 %p9, %r50, %r27;
- @%p9 bra $L__BB57_13;
-
-$L__BB57_14:
- add.s32 %r46, %r46, %r6;
- setp.lt.u32 %p10, %r46, %r28;
- @%p10 bra $L__BB57_5;
- bra.uni $L__BB57_20;
+ setp.ge.u32 %p9, %r5, %r25;
+ @%p9 bra $L__BB57_16;
+
+ mul.wide.u32 %rd24, %r47, 8;
+ add.s64 %rd25, %rd3, %rd24;
+ ld.global.nc.u64 %rd26, [%rd25];
+ cvta.to.global.u64 %rd27, %rd26;
+ shl.b64 %rd28, %rd4, 2;
+ add.s64 %rd29, %rd27, %rd28;
+ ld.global.f32 %f2, [%rd29];
+ mul.lo.s32 %r20, %r47, %r25;
+ mov.u32 %r48, %r5;
$L__BB57_15:
- mov.u32 %r51, %r3;
+ mul.wide.u32 %rd30, %r48, 8;
+ add.s64 %rd31, %rd2, %rd30;
+ ld.global.nc.u64 %rd32, [%rd31];
+ cvta.to.global.u64 %rd33, %rd32;
+ add.s64 %rd35, %rd33, %rd28;
+ ld.global.f32 %f8, [%rd35];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ add.s32 %r41, %r48, %r20;
+ mul.wide.u32 %rd36, %r41, 4;
+ add.s64 %rd37, %rd1, %rd36;
+ atom.global.add.f32 %f11, [%rd37], %f10;
+ add.s32 %r48, %r48, %r7;
+ setp.lt.u32 %p10, %r48, %r25;
+ @%p10 bra $L__BB57_15;
$L__BB57_16:
- setp.ge.u32 %p11, %r5, %r27;
- @%p11 bra $L__BB57_19;
-
- mul.wide.u32 %rd32, %r51, 8;
- add.s64 %rd33, %rd3, %rd32;
- ld.global.nc.u64 %rd34, [%rd33];
- cvta.to.global.u64 %rd35, %rd34;
- shl.b64 %rd36, %rd4, 2;
- add.s64 %rd37, %rd35, %rd36;
- ld.global.f32 %f2, [%rd37];
- mul.lo.s32 %r22, %r51, %r27;
- mov.u32 %r52, %r5;
+ add.s32 %r47, %r47, %r6;
+ setp.lt.u32 %p11, %r47, %r26;
+ @%p11 bra $L__BB57_13;
+
+$L__BB57_17:
+ add.s32 %r42, %r42, %r4;
+ setp.lt.u32 %p12, %r42, %r27;
+ @%p12 bra $L__BB57_2;
$L__BB57_18:
- mul.wide.u32 %rd38, %r52, 8;
- add.s64 %rd39, %rd2, %rd38;
- ld.global.nc.u64 %rd40, [%rd39];
- cvta.to.global.u64 %rd41, %rd40;
- add.s64 %rd43, %rd41, %rd36;
- ld.global.f32 %f11, [%rd43];
- sub.ftz.f32 %f12, %f2, %f11;
- mul.ftz.f32 %f13, %f12, %f12;
- add.s32 %r44, %r52, %r22;
- mul.wide.u32 %rd44, %r44, 4;
- add.s64 %rd45, %rd1, %rd44;
- atom.global.add.f32 %f14, [%rd45], %f13;
- add.s32 %r52, %r52, %r7;
- setp.lt.u32 %p12, %r52, %r27;
- @%p12 bra $L__BB57_18;
-
-$L__BB57_19:
- add.s32 %r51, %r51, %r6;
- setp.lt.u32 %p13, %r51, %r28;
- @%p13 bra $L__BB57_16;
-
-$L__BB57_20:
- add.s32 %r45, %r45, %r4;
- setp.lt.u32 %p14, %r45, %r29;
- @%p14 bra $L__BB57_2;
-
-$L__BB57_21:
ret;
}
- // .globl MultiCosineDistance
-.visible .entry MultiCosineDistance(
- .param .u64 MultiCosineDistance_param_0,
- .param .u64 MultiCosineDistance_param_1,
- .param .u64 MultiCosineDistance_param_2,
- .param .u64 MultiCosineDistance_param_3,
- .param .u64 MultiCosineDistance_param_4,
- .param .u32 MultiCosineDistance_param_5,
- .param .u32 MultiCosineDistance_param_6,
- .param .u32 MultiCosineDistance_param_7
+ // .globl CalculateDistances
+.visible .entry CalculateDistances(
+ .param .u64 CalculateDistances_param_0,
+ .param .u64 CalculateDistances_param_1,
+ .param .u64 CalculateDistances_param_2,
+ .param .u32 CalculateDistances_param_3,
+ .param .u32 CalculateDistances_param_4,
+ .param .u32 CalculateDistances_param_5
+)
+{
+ .reg .pred %p<11>;
+ .reg .f32 %f<12>;
+ .reg .b32 %r<31>;
+ .reg .b64 %rd<31>;
+
+
+ ld.param.u64 %rd7, [CalculateDistances_param_0];
+ ld.param.u64 %rd8, [CalculateDistances_param_1];
+ ld.param.u64 %rd9, [CalculateDistances_param_2];
+ ld.param.u32 %r16, [CalculateDistances_param_3];
+ ld.param.u32 %r17, [CalculateDistances_param_4];
+ ld.param.u32 %r18, [CalculateDistances_param_5];
+ cvta.to.global.u64 %rd1, %rd9;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd7;
+ mov.u32 %r19, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r20, %tid.x;
+ mad.lo.s32 %r26, %r1, %r19, %r20;
+ setp.ge.u32 %p1, %r26, %r17;
+ @%p1 bra $L__BB58_15;
+
+ mov.u32 %r21, %ntid.y;
+ mov.u32 %r22, %ctaid.y;
+ mov.u32 %r23, %tid.y;
+ mad.lo.s32 %r3, %r21, %r22, %r23;
+ mov.u32 %r24, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r24;
+ mov.u32 %r25, %nctaid.y;
+ mul.lo.s32 %r5, %r21, %r25;
+ setp.eq.s32 %p2, %r18, 0;
+ @%p2 bra $L__BB58_11;
+
+$L__BB58_3:
+ setp.ge.u32 %p3, %r3, %r16;
+ @%p3 bra $L__BB58_9;
+
+ setp.eq.s32 %p4, %r18, 2;
+ cvt.u64.u32 %rd4, %r26;
+ mul.wide.u32 %rd10, %r26, 4;
+ add.s64 %rd5, %rd3, %rd10;
+ @%p4 bra $L__BB58_7;
+ bra.uni $L__BB58_5;
+
+$L__BB58_7:
+ ld.global.nc.f32 %f1, [%rd5];
+ mov.u32 %r28, %r3;
+
+$L__BB58_8:
+ mul.wide.u32 %rd13, %r28, 8;
+ add.s64 %rd14, %rd2, %rd13;
+ ld.global.nc.u64 %rd15, [%rd14];
+ cvta.to.global.u64 %rd16, %rd15;
+ shl.b64 %rd17, %rd4, 2;
+ add.s64 %rd18, %rd16, %rd17;
+ ld.global.f32 %f4, [%rd18];
+ sub.ftz.f32 %f5, %f1, %f4;
+ abs.ftz.f32 %f6, %f5;
+ mul.wide.u32 %rd19, %r28, 4;
+ add.s64 %rd20, %rd1, %rd19;
+ atom.global.add.f32 %f7, [%rd20], %f6;
+ add.s32 %r28, %r28, %r5;
+ setp.lt.u32 %p6, %r28, %r16;
+ @%p6 bra $L__BB58_8;
+ bra.uni $L__BB58_9;
+
+$L__BB58_5:
+ mov.u32 %r27, %r3;
+
+$L__BB58_6:
+ mul.wide.u32 %rd11, %r27, 4;
+ add.s64 %rd12, %rd1, %rd11;
+ atom.global.add.f32 %f3, [%rd12], 0f00000000;
+ add.s32 %r27, %r27, %r5;
+ setp.lt.u32 %p5, %r27, %r16;
+ @%p5 bra $L__BB58_6;
+
+$L__BB58_9:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p7, %r26, %r17;
+ @%p7 bra $L__BB58_3;
+ bra.uni $L__BB58_15;
+
+$L__BB58_11:
+ setp.ge.u32 %p8, %r3, %r16;
+ @%p8 bra $L__BB58_14;
+
+ cvt.u64.u32 %rd6, %r26;
+ mul.wide.u32 %rd21, %r26, 4;
+ add.s64 %rd22, %rd3, %rd21;
+ ld.global.nc.f32 %f2, [%rd22];
+ mov.u32 %r30, %r3;
+
+$L__BB58_13:
+ mul.wide.u32 %rd23, %r30, 8;
+ add.s64 %rd24, %rd2, %rd23;
+ ld.global.nc.u64 %rd25, [%rd24];
+ cvta.to.global.u64 %rd26, %rd25;
+ shl.b64 %rd27, %rd6, 2;
+ add.s64 %rd28, %rd26, %rd27;
+ ld.global.f32 %f8, [%rd28];
+ sub.ftz.f32 %f9, %f2, %f8;
+ mul.ftz.f32 %f10, %f9, %f9;
+ mul.wide.u32 %rd29, %r30, 4;
+ add.s64 %rd30, %rd1, %rd29;
+ atom.global.add.f32 %f11, [%rd30], %f10;
+ add.s32 %r30, %r30, %r5;
+ setp.lt.u32 %p9, %r30, %r16;
+ @%p9 bra $L__BB58_13;
+
+$L__BB58_14:
+ add.s32 %r26, %r26, %r4;
+ setp.lt.u32 %p10, %r26, %r17;
+ @%p10 bra $L__BB58_11;
+
+$L__BB58_15:
+ ret;
+
+}
+ // .globl CosineMultiDistance
+.visible .entry CosineMultiDistance(
+ .param .u64 CosineMultiDistance_param_0,
+ .param .u64 CosineMultiDistance_param_1,
+ .param .u64 CosineMultiDistance_param_2,
+ .param .u64 CosineMultiDistance_param_3,
+ .param .u64 CosineMultiDistance_param_4,
+ .param .u32 CosineMultiDistance_param_5,
+ .param .u32 CosineMultiDistance_param_6,
+ .param .u32 CosineMultiDistance_param_7
)
{
.reg .pred %p<7>;
@@ -4224,20 +4332,20 @@ $L__BB57_21:
.reg .b64 %rd<28>;
- ld.param.u64 %rd7, [MultiCosineDistance_param_0];
- ld.param.u64 %rd8, [MultiCosineDistance_param_1];
- ld.param.u64 %rd9, [MultiCosineDistance_param_2];
- ld.param.u64 %rd10, [MultiCosineDistance_param_3];
- ld.param.u64 %rd11, [MultiCosineDistance_param_4];
- ld.param.u32 %r17, [MultiCosineDistance_param_5];
- ld.param.u32 %r18, [MultiCosineDistance_param_6];
- ld.param.u32 %r19, [MultiCosineDistance_param_7];
+ ld.param.u64 %rd7, [CosineMultiDistance_param_0];
+ ld.param.u64 %rd8, [CosineMultiDistance_param_1];
+ ld.param.u64 %rd9, [CosineMultiDistance_param_2];
+ ld.param.u64 %rd10, [CosineMultiDistance_param_3];
+ ld.param.u64 %rd11, [CosineMultiDistance_param_4];
+ ld.param.u32 %r17, [CosineMultiDistance_param_5];
+ ld.param.u32 %r18, [CosineMultiDistance_param_6];
+ ld.param.u32 %r19, [CosineMultiDistance_param_7];
mov.u32 %r20, %ctaid.x;
mov.u32 %r1, %ntid.x;
mov.u32 %r21, %tid.x;
mad.lo.s32 %r31, %r1, %r20, %r21;
setp.ge.u32 %p1, %r31, %r19;
- @%p1 bra $L__BB58_9;
+ @%p1 bra $L__BB59_9;
mov.u32 %r22, %ntid.y;
mov.u32 %r23, %ctaid.y;
@@ -4259,16 +4367,16 @@ $L__BB57_21:
cvta.to.global.u64 %rd4, %rd10;
cvta.to.global.u64 %rd5, %rd11;
-$L__BB58_2:
+$L__BB59_2:
setp.ge.u32 %p2, %r3, %r18;
- @%p2 bra $L__BB58_8;
+ @%p2 bra $L__BB59_8;
cvt.u64.u32 %rd6, %r31;
mov.u32 %r32, %r3;
-$L__BB58_4:
+$L__BB59_4:
setp.ge.u32 %p3, %r5, %r17;
- @%p3 bra $L__BB58_7;
+ @%p3 bra $L__BB59_7;
mul.wide.u32 %rd12, %r32, 8;
add.s64 %rd13, %rd1, %rd12;
@@ -4281,7 +4389,7 @@ $L__BB58_4:
mad.lo.s32 %r33, %r32, %r17, %r5;
mov.u32 %r34, %r5;
-$L__BB58_6:
+$L__BB59_6:
mul.wide.u32 %rd18, %r34, 8;
add.s64 %rd19, %rd2, %rd18;
ld.global.nc.u64 %rd20, [%rd19];
@@ -4300,19 +4408,105 @@ $L__BB58_6:
add.s32 %r33, %r33, %r7;
add.s32 %r34, %r34, %r7;
setp.lt.u32 %p4, %r34, %r17;
- @%p4 bra $L__BB58_6;
+ @%p4 bra $L__BB59_6;
-$L__BB58_7:
+$L__BB59_7:
add.s32 %r32, %r32, %r6;
setp.lt.u32 %p5, %r32, %r18;
- @%p5 bra $L__BB58_4;
+ @%p5 bra $L__BB59_4;
-$L__BB58_8:
+$L__BB59_8:
add.s32 %r31, %r31, %r4;
setp.lt.u32 %p6, %r31, %r19;
- @%p6 bra $L__BB58_2;
+ @%p6 bra $L__BB59_2;
-$L__BB58_9:
+$L__BB59_9:
+ ret;
+
+}
+ // .globl CosineDistances
+.visible .entry CosineDistances(
+ .param .u64 CosineDistances_param_0,
+ .param .u64 CosineDistances_param_1,
+ .param .u64 CosineDistances_param_2,
+ .param .u64 CosineDistances_param_3,
+ .param .u64 CosineDistances_param_4,
+ .param .u32 CosineDistances_param_5,
+ .param .u32 CosineDistances_param_6
+)
+{
+ .reg .pred %p<5>;
+ .reg .f32 %f<9>;
+ .reg .b32 %r<21>;
+ .reg .b64 %rd<24>;
+
+
+ ld.param.u64 %rd7, [CosineDistances_param_0];
+ ld.param.u64 %rd8, [CosineDistances_param_1];
+ ld.param.u64 %rd9, [CosineDistances_param_2];
+ ld.param.u64 %rd10, [CosineDistances_param_3];
+ ld.param.u64 %rd11, [CosineDistances_param_4];
+ ld.param.u32 %r10, [CosineDistances_param_5];
+ ld.param.u32 %r11, [CosineDistances_param_6];
+ mov.u32 %r12, %ctaid.x;
+ mov.u32 %r1, %ntid.x;
+ mov.u32 %r13, %tid.x;
+ mad.lo.s32 %r19, %r1, %r12, %r13;
+ setp.ge.u32 %p1, %r19, %r11;
+ @%p1 bra $L__BB60_6;
+
+ mov.u32 %r14, %ntid.y;
+ mov.u32 %r15, %ctaid.y;
+ mov.u32 %r16, %tid.y;
+ mad.lo.s32 %r3, %r14, %r15, %r16;
+ mov.u32 %r17, %nctaid.x;
+ mul.lo.s32 %r4, %r1, %r17;
+ mov.u32 %r18, %nctaid.y;
+ mul.lo.s32 %r5, %r14, %r18;
+ cvta.to.global.u64 %rd1, %rd7;
+ cvta.to.global.u64 %rd2, %rd8;
+ cvta.to.global.u64 %rd3, %rd9;
+ cvta.to.global.u64 %rd4, %rd10;
+ cvta.to.global.u64 %rd5, %rd11;
+
+$L__BB60_2:
+ setp.ge.u32 %p2, %r3, %r10;
+ @%p2 bra $L__BB60_5;
+
+ cvt.u64.u32 %rd6, %r19;
+ mul.wide.u32 %rd12, %r19, 4;
+ add.s64 %rd13, %rd1, %rd12;
+ ld.global.nc.f32 %f1, [%rd13];
+ mul.ftz.f32 %f2, %f1, %f1;
+ mov.u32 %r20, %r3;
+
+$L__BB60_4:
+ mul.wide.u32 %rd14, %r20, 8;
+ add.s64 %rd15, %rd2, %rd14;
+ ld.global.nc.u64 %rd16, [%rd15];
+ cvta.to.global.u64 %rd17, %rd16;
+ shl.b64 %rd18, %rd6, 2;
+ add.s64 %rd19, %rd17, %rd18;
+ ld.global.f32 %f3, [%rd19];
+ mul.wide.u32 %rd20, %r20, 4;
+ add.s64 %rd21, %rd3, %rd20;
+ atom.global.add.f32 %f4, [%rd21], %f2;
+ add.s64 %rd22, %rd4, %rd20;
+ mul.ftz.f32 %f5, %f1, %f3;
+ atom.global.add.f32 %f6, [%rd22], %f5;
+ add.s64 %rd23, %rd5, %rd20;
+ mul.ftz.f32 %f7, %f3, %f3;
+ atom.global.add.f32 %f8, [%rd23], %f7;
+ add.s32 %r20, %r20, %r5;
+ setp.lt.u32 %p3, %r20, %r10;
+ @%p3 bra $L__BB60_4;
+
+$L__BB60_5:
+ add.s32 %r19, %r19, %r4;
+ setp.lt.u32 %p4, %r19, %r11;
+ @%p4 bra $L__BB60_2;
+
+$L__BB60_6:
ret;
}
@@ -4340,7 +4534,7 @@ $L__BB58_9:
mov.u32 %r2, %tid.x;
mad.lo.s32 %r3, %r22, %r1, %r2;
setp.ge.u32 %p1, %r3, %r20;
- @%p1 bra $L__BB59_2;
+ @%p1 bra $L__BB61_2;
cvta.to.global.u64 %rd3, %rd1;
mul.lo.s32 %r23, %r3, %r21;
@@ -4352,17 +4546,17 @@ $L__BB58_9:
add.s32 %r26, %r25, %r24;
st.shared.f32 [%r26], %f8;
-$L__BB59_2:
+$L__BB61_2:
bar.sync 0;
setp.ne.s32 %p2, %r2, 0;
- @%p2 bra $L__BB59_11;
+ @%p2 bra $L__BB61_11;
shl.b32 %r4, %r1, 10;
sub.s32 %r27, %r20, %r4;
min.u32 %r5, %r27, 1024;
setp.eq.s32 %p3, %r5, 0;
mov.f32 %f25, 0f00000000;
- @%p3 bra $L__BB59_10;
+ @%p3 bra $L__BB61_10;
not.b32 %r29, %r20;
add.s32 %r30, %r4, %r29;
@@ -4373,7 +4567,7 @@ $L__BB59_2:
setp.lt.u32 %p4, %r33, 3;
mov.f32 %f25, 0f00000000;
mov.u32 %r45, 0;
- @%p4 bra $L__BB59_7;
+ @%p4 bra $L__BB61_7;
add.s32 %r36, %r4, -1;
sub.s32 %r37, %r36, %r20;
@@ -4382,7 +4576,7 @@ $L__BB59_2:
neg.s32 %r42, %r39;
mov.u32 %r43, _ZZ9SumValuesE5block;
-$L__BB59_6:
+$L__BB61_6:
ld.shared.f32 %f13, [%r43];
add.ftz.f32 %f14, %f25, %f13;
ld.shared.f32 %f15, [%r43+4];
@@ -4395,32 +4589,32 @@ $L__BB59_6:
add.s32 %r43, %r43, 16;
add.s32 %r42, %r42, -4;
setp.ne.s32 %p5, %r42, 1;
- @%p5 bra $L__BB59_6;
+ @%p5 bra $L__BB61_6;
-$L__BB59_7:
+$L__BB61_7:
setp.eq.s32 %p6, %r47, 0;
- @%p6 bra $L__BB59_10;
+ @%p6 bra $L__BB61_10;
shl.b32 %r40, %r45, 2;
mov.u32 %r41, _ZZ9SumValuesE5block;
add.s32 %r46, %r41, %r40;
-$L__BB59_9:
+$L__BB61_9:
.pragma "nounroll";
ld.shared.f32 %f20, [%r46];
add.ftz.f32 %f25, %f25, %f20;
add.s32 %r46, %r46, 4;
add.s32 %r47, %r47, -1;
setp.ne.s32 %p7, %r47, 0;
- @%p7 bra $L__BB59_9;
+ @%p7 bra $L__BB61_9;
-$L__BB59_10:
+$L__BB61_10:
cvta.to.global.u64 %rd6, %rd2;
mul.wide.u32 %rd7, %r1, 4;
add.s64 %rd8, %rd6, %rd7;
st.global.f32 [%rd8], %f25;
-$L__BB59_11:
+$L__BB61_11:
ret;
}
diff --git a/BrightData.Cuda/cuda/build_kernels.bat b/BrightData.Cuda/cuda/build_kernels.bat
index 226d85f6..2a6eccc9 100644
--- a/BrightData.Cuda/cuda/build_kernels.bat
+++ b/BrightData.Cuda/cuda/build_kernels.bat
@@ -1,16 +1,16 @@
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_50 -o brightwire_50.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire_52.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_53 -o brightwire_53.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_60 -o brightwire_60.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_61 -o brightwire_61.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_62 -o brightwire_62.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_70 -o brightwire_70.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_72 -o brightwire_72.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_75 -o brightwire_75.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_80 -o brightwire_80.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_86 -o brightwire_86.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_87 -o brightwire_87.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_89 -o brightwire_89.ptx
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_90 -o brightwire_90.ptx
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_50 -o brightwire_50.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire_52.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_53 -o brightwire_53.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_60 -o brightwire_60.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_61 -o brightwire_61.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_62 -o brightwire_62.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_70 -o brightwire_70.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_72 -o brightwire_72.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_75 -o brightwire_75.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_80 -o brightwire_80.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_86 -o brightwire_86.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_87 -o brightwire_87.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_89 -o brightwire_89.ptx -allow-unsupported-compiler
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_90 -o brightwire_90.ptx -allow-unsupported-compiler
-nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire.ptx
\ No newline at end of file
+nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire.ptx -allow-unsupported-compiler
\ No newline at end of file
diff --git a/BrightData.UnitTests/CudaTests.cs b/BrightData.UnitTests/CudaTests.cs
index eb4cd5ec..e087565f 100644
--- a/BrightData.UnitTests/CudaTests.cs
+++ b/BrightData.UnitTests/CudaTests.cs
@@ -1,4 +1,6 @@
-using BrightData.LinearAlgebra.Segments;
+using System.Linq;
+using BrightData.Helper;
+using BrightData.LinearAlgebra.Segments;
using BrightData.UnitTests.Helper;
using FluentAssertions;
using Xunit;
@@ -10,11 +12,48 @@ public class CudaTests : CudaBase
[Fact]
public void CopyToWrapperWithStride()
{
- var empty = _cuda.CreateSegment(24, true);
- var ones = _cuda.CreateSegment(8, _ => 1);
+ using var empty = _cuda.CreateSegment(24, true);
+ using var ones = _cuda.CreateSegment(8, _ => 1);
var wrapper = new MutableTensorSegmentWrapper(empty, 0, 3, 8);
ones.CopyTo(wrapper);
empty.ToNewArray()[..6].Should().BeEquivalentTo([1, 0, 0, 1, 0, 0]);
}
+
+        void FindDistance(DistanceMetric distanceMetric) // shared body of the Find*Distance facts: GPU distances must match the CPU provider's
+        {
+            using var gpuSegment = _cuda.CreateSegment(8, _ => _context.NextRandomFloat());
+            var gpuSegments = 8.AsRange().Select(_ => _cuda.CreateSegment(8, _ => _context.NextRandomFloat())).ToArray();
+            using var gpuDistance = _cuda.FindDistances(gpuSegment, gpuSegments, distanceMetric);
+            // CPU reference: the same data, held in segments created by the CPU provider
+            using var cpuSegment = _cpu.CreateSegment(gpuSegment);
+            var cpuSegments = gpuSegments.Select(_cpu.CreateSegment).ToArray();
+            using var cpuDistance = _cpu.FindDistances(cpuSegment, cpuSegments, distanceMetric);
+            // both providers must agree on every distance
+            foreach (var (g, c) in gpuDistance.Values.Zip(cpuDistance.Values))
+                Math.AreApproximatelyEqual(g, c).Should().BeTrue();
+            // the arrays are not covered by the using declarations above, so each segment is released explicitly
+            foreach(var item in cpuSegments)
+                item.Dispose();
+            foreach(var item in gpuSegments)
+                item.Dispose();
+        }
+
+ [Fact]
+ public void FindCosineDistance()
+ {
+ FindDistance(DistanceMetric.Cosine);
+ }
+
+ [Fact]
+ public void FindEuclideanDistance()
+ {
+ FindDistance(DistanceMetric.Euclidean);
+ }
+
+ [Fact]
+ public void FindManhattanDistance()
+ {
+ FindDistance(DistanceMetric.Manhattan);
+ }
}
}
diff --git a/BrightData.UnitTests/VectorTests.cs b/BrightData.UnitTests/VectorTests.cs
index f5315149..d7813939 100644
--- a/BrightData.UnitTests/VectorTests.cs
+++ b/BrightData.UnitTests/VectorTests.cs
@@ -77,20 +77,17 @@ public void TestVectorCreation()
void TestDistances(DistanceMetric distanceMetric)
{
var distribution = _context.CreateNormalDistribution(0, 5);
- var vectors = Enumerable.Range(0, 10).Select(_ => _cpu.CreateVector(100, _ => distribution.Sample())).ToArray();
- var compareTo = Enumerable.Range(0, 20).Select(_ => _cpu.CreateVector(100, _ => distribution.Sample())).ToArray();
+ var vectors = Enumerable.Range(0, 10).Select(_ => (IReadOnlyNumericSegment)_cpu.CreateSegment(100, _ => distribution.Sample())).ToArray();
+ var compareTo = Enumerable.Range(0, 20).Select(_ => (IReadOnlyNumericSegment)_cpu.CreateSegment(100, _ => distribution.Sample())).ToArray();
- var gpuVectors = vectors.Select(v => _cuda.CreateVector(v.Segment)).ToArray();
- var gpuCompareTo = compareTo.Select(v => _cuda.CreateVector(v.Segment)).ToArray();
-
- var mklVectors = vectors.Select(v => _mkl.CreateVector(v.Segment)).ToArray();
- var mklCompareTo = compareTo.Select(v => _mkl.CreateVector(v.Segment)).ToArray();
+ var gpuVectors = vectors.Select(_cuda.CreateSegment).ToArray();
+ var gpuCompareTo = compareTo.Select(_cuda.CreateSegment).ToArray();
try {
AssertSameAndThenDispose(
_cpu.FindDistances(vectors, compareTo, distanceMetric),
_cuda.FindDistances(gpuVectors, gpuCompareTo, distanceMetric),
- _mkl.FindDistances(mklVectors, mklCompareTo, distanceMetric)
+ _mkl.FindDistances(vectors, compareTo, distanceMetric)
);
}
finally {
@@ -98,8 +95,6 @@ void TestDistances(DistanceMetric distanceMetric)
compareTo.DisposeAll();
gpuVectors.DisposeAll();
gpuCompareTo.DisposeAll();
- mklVectors.DisposeAll();
- mklCompareTo.DisposeAll();
}
}
diff --git a/BrightData/BrightData.xml b/BrightData/BrightData.xml
index 226c3e14..87ebccac 100644
--- a/BrightData/BrightData.xml
+++ b/BrightData/BrightData.xml
@@ -9637,6 +9637,13 @@
+
+
+ Creates a tensor segment from an existing segment
+
+
+
+
Creates a tensor segment
@@ -10515,7 +10522,7 @@
-
+
Finds the distance between each pair of vectors
@@ -10524,6 +10531,24 @@
Distance metric
Matrix with the rows corresponding to the first set and columns corresponding to the second set and each element containing the distance
+
+
+ Finds the distance between each pair of vectors
+
+
+
+
+
+
+
+
+ Finds the distance between a vector and list of vectors
+
+
+
+
+
+
Binds a new thread to this provider
@@ -12059,12 +12084,12 @@
Current number of neighbours
-
+
The smallest neighbour weight
-
+
The largest neighbour weight
@@ -12079,7 +12104,7 @@
The index of the neighbour with the largest weight
-
+
Tries to add a new neighbour - will succeed if there aren't already max neighbours with a smaller weight
@@ -12114,6 +12139,17 @@
+
+
+ Creates a vector graph from an array of nodes
+
+
+
+
+
+ Number of nodes in the graph
+
+
Gets the neighbours for a node, sorted by distance
@@ -12128,6 +12164,13 @@
+
+
+ Enumerates the neighbour indices and their weights in ascending order
+
+
+
+
Creates
diff --git a/BrightData/ExtensionMethods.TensorSegment.cs b/BrightData/ExtensionMethods.TensorSegment.cs
index eb97c2f1..0fa60106 100644
--- a/BrightData/ExtensionMethods.TensorSegment.cs
+++ b/BrightData/ExtensionMethods.TensorSegment.cs
@@ -1,5 +1,6 @@
using BrightData.Helper;
using System;
+using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Linq;
@@ -616,5 +617,31 @@ public static ReadOnlyMemory GetMemory(this IReadOnlyNumericSegment seg
: segment.ToNewArray()
;
}
+
+        class MaxComparer : IComparer // reversed ordering: Compare swaps its operands, so larger values sort first
+            where T : unmanaged, INumber, IComparable
+        {
+            public int Compare(T x, T y) => y.CompareTo(x);
+        }
+        class MinComparer : IComparer // natural ordering: delegates straight to CompareTo
+            where T : unmanaged, INumber, IComparable
+        {
+            public int Compare(T x, T y) => x.CompareTo(y);
+        }
+        /// <summary>Builds a priority queue of at most <paramref name="count"/> element indices keyed by element value, retaining the smallest values when <paramref name="ascending"/> is true and the largest otherwise</summary>
+        public static PriorityQueue RankedIndices(this IReadOnlyNumericSegment segment, uint count, bool ascending = true)
+            where T : unmanaged, INumber
+        {
+            // ascending uses the reversed comparer so that the head of the queue (the item evicted first) is the largest retained value
+            var ret = new PriorityQueue(ascending ? new MaxComparer() : new MinComparer());
+            segment.ApplyReadOnlySpan(x => {
+                for (int i = 0, len = x.Length; i < len; i++) {
+                    if (ret.Count < count)
+                        ret.Enqueue((uint)i, x[i]); // still below capacity: keep every element
+                    else
+                        ret.EnqueueDequeue((uint)i, x[i]); // at capacity: add, then drop whichever item the comparer ranks first
+                }
+            });
+            return ret; // note: dequeuing follows the comparer, i.e. largest-first when ascending is true
+        }
}
}
diff --git a/BrightData/LinearAlgebra/LinearAlgebraProvider.cs b/BrightData/LinearAlgebra/LinearAlgebraProvider.cs
index e4590f83..4b8a7e8b 100644
--- a/BrightData/LinearAlgebra/LinearAlgebraProvider.cs
+++ b/BrightData/LinearAlgebra/LinearAlgebraProvider.cs
@@ -9,6 +9,7 @@
using BrightData.LinearAlgebra.ReadOnly;
using BrightData.LinearAlgebra.Segments;
using CommunityToolkit.HighPerformance.Buffers;
+using static BrightData.DataTable.ColumnOrientedDataTable;
namespace BrightData.LinearAlgebra
{
@@ -117,6 +118,13 @@ public virtual void PopScope()
///
public virtual INumericSegment CreateSegment(params T[] data) => new MutableTensorSegment(data);
+        /// <summary>
+        /// Creates a tensor segment from an existing segment
+        /// </summary>
+        /// <param name="segment">Segment to read; the array returned by its ToNewArray() backs the new segment</param>
+        /// <returns>A new MutableTensorSegment wrapping that array</returns>
+        public virtual INumericSegment CreateSegment(IReadOnlyNumericSegment segment) => new MutableTensorSegment(segment.ToNewArray());
+
///
/// Creates a tensor segment
///
@@ -1219,10 +1227,10 @@ public virtual IMatrix SoftmaxDerivative(IReadOnlyNumericSegment tensor)
/// Second set of vectors
/// Distance metric
/// Matrix with the rows corresponding to the first set and columns corresponding to the second set and each element containing the distance
- public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
+ public virtual IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
{
var rows = (uint)compareTo.Count;
- var columns = (uint)vectors.Length;
+ var columns = (uint)vectors.Count;
var ret = CreateMatrix(rows, columns, false);
var totalSize = rows * columns;
@@ -1230,13 +1238,13 @@ public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList {
var i = (uint)(ind % rows);
var j = (uint)(ind / rows);
- ret[i, j] = compareTo[(int)i].FindDistance(vectors[j], distanceMetric);
+ ret[i, j] = compareTo[(int)i].FindDistance(vectors[(int)j], distanceMetric);
});
}
else {
for (uint i = 0; i < rows; i++) {
for (uint j = 0; j < columns; j++) {
- ret[i, j] = compareTo[(int)i].FindDistance(vectors[j], distanceMetric);
+ ret[i, j] = compareTo[(int)i].FindDistance(vectors[(int)j], distanceMetric);
}
}
}
@@ -1244,6 +1252,45 @@ public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList
+        /// Finds the distance between each pair of vectors
+        ///
+        /// <param name="vectors">First set of vectors; each vector's Segment is used</param>
+        /// <param name="compareTo">Second set of vectors; each vector's Segment is used</param>
+        /// <param name="distanceMetric">Distance metric</param>
+        /// <returns>Result of the segment-based FindDistances overload for the extracted segments</returns>
+        public virtual IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
+        {
+            return FindDistances(
+                vectors.Select(x => x.Segment).ToArray(),
+                compareTo.Select(x => x.Segment).ToArray(),
+                distanceMetric
+            );
+        }
+
+        /// <summary>
+        /// Finds the distance between a vector and list of vectors
+        /// </summary>
+        /// <param name="vector">Vector to measure from</param>
+        /// <param name="compareTo">Vectors to measure against</param>
+        /// <param name="distanceMetric">Distance metric</param>
+        /// <returns>Vector with one element per entry of compareTo; element i is the distance between vector and compareTo[i]</returns>
+        public virtual IVector FindDistances(IReadOnlyNumericSegment vector, IReadOnlyList> compareTo, DistanceMetric distanceMetric)
+        {
+            var size = (uint)compareTo.Count;
+            var ret = CreateVector(size, false);
+            if (size >= Consts.MinimumSizeForParallel) { // at or above the threshold the per-item distances are computed in parallel
+                Parallel.For(0, ret.Size, i => {
+                    ret[i] = vector.FindDistance(compareTo[(int)i], distanceMetric);
+                });
+            }
+            else {
+                for (uint i = 0; i < size; i++)
+                    ret[i] = vector.FindDistance(compareTo[(int)i], distanceMetric);
+            }
+            return ret;
+        }
+
///
/// Binds a new thread to this provider
///
diff --git a/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs b/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs
index f8e9822f..b9f9365a 100644
--- a/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs
+++ b/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs
@@ -24,12 +24,12 @@ internal struct IndexFixedSize
public uint _element0;
}
[InlineArray(MaxNeighbours)]
- internal struct DistanceFixedSize
+ internal struct WeightFixedSize
{
public T _element0;
}
readonly IndexFixedSize _neighbourIndices = new();
- readonly DistanceFixedSize _neighbourWeights = new();
+ readonly WeightFixedSize _neighbourWeights = new();
///
/// Current number of neighbours
@@ -39,12 +39,12 @@ internal struct DistanceFixedSize
///
/// The smallest neighbour weight
///
- public readonly T MinDistance => NeighbourCount > 0 ? NeighbourWeights[0] : T.MaxValue;
+ public readonly T MinWeight => NeighbourCount > 0 ? NeighbourWeights[0] : T.MaxValue;
///
/// The largest neighbour weight
///
- public readonly T MaxDistance => NeighbourCount > 0 ? NeighbourWeights[NeighbourCount - 1] : T.MinValue;
+ public readonly T MaxWeight => NeighbourCount > 0 ? NeighbourWeights[NeighbourCount - 1] : T.MinValue;
///
/// The index of the neighbour with the smallest weight
@@ -62,60 +62,11 @@ internal struct DistanceFixedSize
///
///
///
- public unsafe bool TryAddNeighbour2(uint neighbourIndex, T neighbourWeight)
- {
- var isFull = NeighbourCount == MaxNeighbours;
- fixed (uint* indices = &_neighbourIndices._element0)
- fixed (T* weights = &_neighbourWeights._element0) {
- // check to see if it should be inserted
- if (isFull && weights[NeighbourCount - 1] <= neighbourWeight)
- return false;
-
- byte insertPosition = 0;
- var foundInsertPosition = false;
- for (byte i = 0; i < NeighbourCount; i++) {
- // check that the neighbour has not already been added
- if (indices[i] == neighbourIndex)
- return false;
-
- // see if we should insert here
- if (weights[i] > neighbourWeight) {
- insertPosition = i;
- foundInsertPosition = true;
- break;
- }
- }
-
- if (!foundInsertPosition) {
- // there is no room left
- if (isFull)
- return false;
-
- // insert at end
- insertPosition = NeighbourCount;
- }
- else {
- // shuffle to make room
- for (var i = NeighbourCount - (isFull ? 2 : 1); i >= insertPosition; i--) {
- indices[i + 1] = indices[i];
- weights[i + 1] = weights[i];
- }
- }
-
- // insert the item
- indices[insertPosition] = neighbourIndex;
- weights[insertPosition] = neighbourWeight;
- if (!isFull)
- ++NeighbourCount;
- }
- return true;
- }
-
public bool TryAddNeighbour(uint neighbourIndex, T neighbourWeight)
{
var isFull = NeighbourCount == MaxNeighbours;
var indices = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourIndices)), MaxNeighbours);
- var weights = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), MaxNeighbours);
+ var weights = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), MaxNeighbours);
// check to see if it should be inserted
if (isFull && weights[NeighbourCount - 1] <= neighbourWeight)
@@ -181,7 +132,7 @@ public bool TryAddNeighbour(uint neighbourIndex, T neighbourWeight)
///
/// Sorted list of neighbour weights
///
- public readonly ReadOnlySpan NeighbourWeights => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), NeighbourCount);
+ public readonly ReadOnlySpan NeighbourWeights => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), NeighbourCount);
///
/// Returns a neighbour weight
diff --git a/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs b/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs
index d7150987..3fcb5d3f 100644
--- a/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs
+++ b/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs
@@ -1,4 +1,5 @@
using System;
+using System.Collections.Generic;
using System.IO;
using System.Numerics;
using System.Runtime.CompilerServices;
@@ -11,16 +12,25 @@ namespace BrightData.LinearAlgebra.VectorIndexing.Helper
/// Creates a graph of vectors with a fixed size set of neighbours
///
///
- public class VectorGraph
+ public class VectorGraph : IHaveSize
where T : unmanaged, IBinaryFloatingPointIeee754, IMinMaxValue
{
readonly IndexedFixedSizeGraphNode[] _nodes;
- VectorGraph(IndexedFixedSizeGraphNode[] nodes)
+        /// <summary>
+        /// Creates a vector graph from an array of nodes
+        /// </summary>
+        /// <param name="nodes">Graph nodes; the array reference is stored as-is (no copy is made here)</param>
+        public VectorGraph(IndexedFixedSizeGraphNode[] nodes)
         {
             _nodes = nodes;
         }
+        /// <summary>
+        /// Number of nodes in the graph
+        /// </summary>
+        public uint Size => (uint)_nodes.Length;
+
///
/// Gets the neighbours for a node, sorted by distance
///
@@ -35,6 +45,13 @@ public class VectorGraph
///
public ReadOnlySpan GetNeighbourWeights(uint vectorIndex) => _nodes[vectorIndex].NeighbourWeights;
+        /// <summary>
+        /// Enumerates the neighbour indices and their weights in ascending order
+        /// </summary>
+        /// <param name="vectorIndex">Index of the node whose neighbours are enumerated</param>
+        /// <returns>The node's WeightedNeighbours sequence of (neighbour index, neighbour weight) pairs</returns>
+        public IEnumerable<(uint NeighbourIndex, T NeighbourWeight)> GetWeightedNeighbours(uint vectorIndex) => _nodes[vectorIndex].WeightedNeighbours;
+
///
/// Creates
///
diff --git a/BrightWire/BrightWire.xml b/BrightWire/BrightWire.xml
index 6011257f..cd838d2d 100644
--- a/BrightWire/BrightWire.xml
+++ b/BrightWire/BrightWire.xml
@@ -2981,59 +2981,6 @@
Calculate vector based statistics
-
-
- Calculates the distance between a list of vectors and a list of vectors to compare against
-
-
-
-
- Constructor
-
- List of vectors to compare
- Distance metric for comparison
-
-
-
- The list of vectors to compare against
-
-
-
-
- Distance metric
-
-
-
-
- Adds a comparison vector (will be owned and disposed by the helper class)
-
- Vector to compare against
- Index of the comparison vector
-
-
-
- Updates the comparison vector at this index (disposes the old vector)
-
- Index to update
- Vector to replace with
-
-
-
- Updates the entire list of comparison vectors
-
- List of vectors to compare against
-
-
-
- Returns the index of the closest comparison vector for each vector
-
-
-
-
- Returns a vector averaged from the data vectors
-
- Indices of the data vectors to use in the averaged vector
-
K Nearest Neighbour classifier
diff --git a/BrightWire/Helper/VectorDistanceHelper.cs b/BrightWire/Helper/VectorDistanceHelper.cs
deleted file mode 100644
index 1e335999..00000000
--- a/BrightWire/Helper/VectorDistanceHelper.cs
+++ /dev/null
@@ -1,130 +0,0 @@
-using System;
-using System.Collections.Generic;
-using System.Linq;
-using BrightData;
-using BrightData.LinearAlgebra;
-
-namespace BrightWire.Helper
-{
- ///
- /// Calculates the distance between a list of vectors and a list of vectors to compare against
- ///
- public class VectorDistanceHelper : IDisposable
- {
- readonly LinearAlgebraProvider _lap;
- readonly List> _comparison = [];
- readonly IVector[] _data;
-
- ///
- /// Constructor
- ///
- /// List of vectors to compare
- /// Distance metric for comparison
- public VectorDistanceHelper(IVector[] data, DistanceMetric distanceMetric = DistanceMetric.Euclidean)
- {
- _lap = data[0].Context.LinearAlgebraProvider;
- Metric = distanceMetric;
- _data = data;
- }
-
- void IDisposable.Dispose()
- {
- _comparison.ForEach(x => x.Dispose());
- Array.ForEach(_data, x => x.Dispose());
- _comparison.Clear();
- }
-
- ///
- /// The list of vectors to compare against
- ///
- public IReadOnlyList> CompareTo => _comparison;
-
- ///
- /// Distance metric
- ///
- public DistanceMetric Metric { get; }
-
- ///
- /// Adds a comparison vector (will be owned and disposed by the helper class)
- ///
- /// Vector to compare against
- /// Index of the comparison vector
- public int AddComparison(IVector comparison)
- {
- var ret = _comparison.Count;
- _comparison.Add(comparison);
- return ret;
- }
-
- ///
- /// Updates the comparison vector at this index (disposes the old vector)
- ///
- /// Index to update
- /// Vector to replace with
- public void UpdateComparisonVector(int index, IVector newVector)
- {
- _comparison[index].Dispose();
- _comparison[index] = newVector;
- }
-
- ///
- /// Updates the entire list of comparison vectors
- ///
- /// List of vectors to compare against
- public void SetComparisonVectors(IEnumerable> comparisonVectors)
- {
- _comparison.ForEach(c => c.Dispose());
- _comparison.Clear();
- _comparison.AddRange(comparisonVectors);
- }
-
- ///
- /// Returns the index of the closest comparison vector for each vector
- ///
- public uint[] GetClosest()
- {
- using var distance = _lap.FindDistances(_data, _comparison, Metric);
- return _data.Length.AsRange()
- .Select(i => GetMinimum(distance, i).Index)
- .ToArray();
- }
-
- ///
- /// Returns a vector averaged from the data vectors
- ///
- /// Indices of the data vectors to use in the averaged vector
- public IVector GetAverageFromData(uint[] indices)
- {
- using var data = _lap.CreateMatrixFromColumns(indices.Select(i => _data[i]).ToArray());
- var result = data.RowSums();
- result.MultiplyInPlace(1f / indices.Length);
- return result;
- }
-
- (uint Index, float Value) GetMinimum(IMatrix matrix, uint index)
- {
- var len = _comparison.Count;
-
- switch (len) {
- case 1:
- return (0, matrix[0, index]);
- case 0:
- throw new Exception("Cannot find minimum with zero length");
- }
-
- var (min, _, minIndex, _) = matrix.GetColumnSpan(index).GetMinAndMaxValues();
- return (minIndex, min);
-
- //var bestIndex = uint.MaxValue;
- //var min = float.MaxValue;
- //for (uint j = 0; j < len; j++) {
- // var val = matrix[j, columnIndex];
- // if (val < min) {
- // bestIndex = j;
- // min = val;
- // }
- //}
- //return (bestIndex, min);
- }
- }
-}