From 3ff5c4e0bc64487129514384886e14c94a0211de Mon Sep 17 00:00:00 2001 From: Jack Dermody Date: Sat, 27 Jul 2024 16:38:03 +1000 Subject: [PATCH] added single to many vector distance calculation --- BrightData.Cuda/BrightData.Cuda.xml | 5 +- BrightData.Cuda/CudaLinearAlgebraProvider.cs | 74 ++- BrightData.Cuda/CudaProvider.cs | 155 +++-- BrightData.Cuda/CudaTensorSegment.cs | 3 +- BrightData.Cuda/cuda/brightwire.cu | 51 +- BrightData.Cuda/cuda/brightwire.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_50.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_52.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_53.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_60.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_61.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_62.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_70.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_72.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_75.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_80.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_86.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_87.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_89.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/brightwire_90.ptx | 590 ++++++++++++------ BrightData.Cuda/cuda/build_kernels.bat | 30 +- BrightData.UnitTests/CudaTests.cs | 45 +- BrightData.UnitTests/VectorTests.cs | 15 +- BrightData/BrightData.xml | 51 +- BrightData/ExtensionMethods.TensorSegment.cs | 27 + .../LinearAlgebra/LinearAlgebraProvider.cs | 55 +- .../Helper/IndexedFixedSizeGraphNode.cs | 61 +- .../VectorIndexing/Helper/VectorGraph.cs | 21 +- BrightWire/BrightWire.xml | 53 -- BrightWire/Helper/VectorDistanceHelper.cs | 130 ---- 30 files changed, 6308 insertions(+), 3318 deletions(-) delete mode 100644 BrightWire/Helper/VectorDistanceHelper.cs diff --git 
a/BrightData.Cuda/BrightData.Cuda.xml b/BrightData.Cuda/BrightData.Cuda.xml index 3e746986..bbef7e59 100644 --- a/BrightData.Cuda/BrightData.Cuda.xml +++ b/BrightData.Cuda/BrightData.Cuda.xml @@ -55,6 +55,9 @@ + + + @@ -247,7 +250,7 @@ - + diff --git a/BrightData.Cuda/CudaLinearAlgebraProvider.cs b/BrightData.Cuda/CudaLinearAlgebraProvider.cs index 70916483..1d144526 100644 --- a/BrightData.Cuda/CudaLinearAlgebraProvider.cs +++ b/BrightData.Cuda/CudaLinearAlgebraProvider.cs @@ -9,6 +9,7 @@ using BrightData.LinearAlgebra; using BrightData.LinearAlgebra.Segments; using CommunityToolkit.HighPerformance.Buffers; +using static System.Runtime.InteropServices.JavaScript.JSType; namespace BrightData.Cuda { @@ -75,6 +76,23 @@ public override INumericSegment CreateSegment(params float[] data) return new CudaTensorSegment(deviceMemory, Provider); } + /// + public override INumericSegment CreateSegment(IReadOnlyNumericSegment segment) + { + var deviceMemory = Provider.Allocate(segment.Size); + var temp = SpanOwner.Empty; + var wasTempUsed = false; + try { + var span = segment.GetSpan(ref temp, out wasTempUsed); + deviceMemory.CopyToDevice(span, 0); + } + finally { + if (wasTempUsed) + temp.Dispose(); + } + return new CudaTensorSegment(deviceMemory, Provider); + } + internal CudaTensorSegment CreateCudaTensorSegment(IDeviceMemoryPtr ptr) => new(ptr, Provider); /// @@ -713,14 +731,14 @@ public override IMatrix CreateMatrix(uint rows, uint columns, bool initia } /// - public override IMatrix FindDistances(IVector[] vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric) + public override IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric) { if (distanceMetric is not (DistanceMetric.Euclidean or DistanceMetric.Manhattan or DistanceMetric.Cosine)) throw new NotImplementedException(); var size = vectors[0].Size; var rows = (uint)compareTo.Count; - var columns = (uint)vectors.Length; + var columns = (uint)vectors.Count; var 
ret = Provider.Allocate(rows * columns, null, true); using (var vectorPtr = new PtrToDeviceMemoryList(vectors.Cast().ToArray())) @@ -746,7 +764,7 @@ public override IMatrix FindDistances(IVector[] vectors, IReadOnly return ones.Subtract(distance); } - Provider.CalculateDistances(size, columns, rows, + Provider.CalculateMultiDistances(size, columns, rows, vectorPtr.DevicePointer, compareToPtr.DevicePointer, ret.DevicePointer, @@ -764,6 +782,56 @@ public override IMatrix FindDistances(IVector[] vectors, IReadOnly return matrix; } + public override IVector FindDistances(IReadOnlyNumericSegment vector, IReadOnlyList> compareTo, DistanceMetric distanceMetric) + { + if (distanceMetric is not (DistanceMetric.Euclidean or DistanceMetric.Manhattan or DistanceMetric.Cosine)) + throw new NotImplementedException(); + + var size = vector.Size; + var numVectors = (uint)compareTo.Count; + var ret = Provider.Allocate(numVectors, null, true); + + var vectorPtr = (IHaveDeviceMemory)vector; + using (var compareToPtr = new PtrToDeviceMemoryList(compareTo.Cast().ToArray())) { + if (distanceMetric == DistanceMetric.Cosine) { + var aa = Provider.Allocate(numVectors, null, true); + var bb = Provider.Allocate(numVectors, null, true); + Provider.CosineDistances(size, numVectors, + vectorPtr.Memory.DevicePointer, + compareToPtr.DevicePointer, + aa.DevicePointer, + ret.DevicePointer, + bb.DevicePointer + ); + using var ones = CreateVector(numVectors, _ => 1f); + using var vectorMagnitude = new CudaVector(CreateCudaTensorSegment(aa), this); + using var vectorSqrt = vectorMagnitude.Sqrt(); + using var compareToMagnitude = new CudaVector(CreateCudaTensorSegment(bb), this); + using var compareToSqrt = compareToMagnitude.Sqrt(); + using var norms = vectorSqrt.PointwiseMultiply(compareToSqrt); + using var result = new CudaVector(CreateCudaTensorSegment(ret), this); + using var distance = result.PointwiseDivide(norms); + return ones.Subtract(distance); + } + + Provider.CalculateDistances(size, 
numVectors, + vectorPtr.Memory.DevicePointer, + compareToPtr.DevicePointer, + ret.DevicePointer, + distanceMetric + ); + } + + IVector matrix = new CudaVector(CreateCudaTensorSegment(ret), this); + if (distanceMetric == DistanceMetric.Euclidean) { + var sqrt = matrix.Sqrt(); + matrix.Dispose(); + matrix = sqrt; + } + + return matrix; + } + /// public override void BindThread() { diff --git a/BrightData.Cuda/CudaProvider.cs b/BrightData.Cuda/CudaProvider.cs index 3022146a..90454a6a 100644 --- a/BrightData.Cuda/CudaProvider.cs +++ b/BrightData.Cuda/CudaProvider.cs @@ -135,7 +135,9 @@ readonly CuFunction _tensorReverseMaxPool, _tensorReverseIm2Col, _isFinite, - _calculateDistance, + _calculateMultiDistances, + _calculateDistances, + _cosineDistances, _roundInPlace, _scale ; @@ -197,66 +199,68 @@ public CudaProvider(BrightDataContext context, string? cudaKernelPath, string? c }); _cuda.SetCurrent(); - _pointwiseMultiply = _kernel.LoadFunction("PointwiseMultiply"); - _addInPlace = _kernel.LoadFunction("AddInPlace"); - _subtractInPlace = _kernel.LoadFunction("SubtractInPlace"); - _addToEachRow = _kernel.LoadFunction("AddToEachRow"); - _addToEachColumn = _kernel.LoadFunction("AddToEachColumn"); - _multiplyByEachRow = _kernel.LoadFunction("MultiplyByEachRow"); - _multiplyByEachColumn = _kernel.LoadFunction("MultiplyByEachColumn"); - _tanh = _kernel.LoadFunction("TanH"); - _tanhDerivative = _kernel.LoadFunction("TanHDerivative"); - _sigmoid = _kernel.LoadFunction("Sigmoid"); - _sigmoidDerivative = _kernel.LoadFunction("SigmoidDerivative"); - _sumRows = _kernel.LoadFunction("SumRows"); - _relu = _kernel.LoadFunction("RELU"); - _reluDerivative = _kernel.LoadFunction("RELUDerivative"); - _memSet = _kernel.LoadFunction("MemSet"); - _memCpy = _kernel.LoadFunction("MemCpy"); - _sumColumns = _kernel.LoadFunction("SumColumns"); - _pointwiseDivide = _kernel.LoadFunction("PointwiseDivide"); - _sqrt = _kernel.LoadFunction("Sqrt"); - _findMinAndMax = 
_kernel.LoadFunction("FindMinAndMax"); - _sumValues = _kernel.LoadFunction("SumValues"); - _findStdDev = _kernel.LoadFunction("FindStdDev"); - _constrain = _kernel.LoadFunction("Constrain"); - _pow = _kernel.LoadFunction("Pow"); - _diagonal = _kernel.LoadFunction("Diagonal"); - _l1Regularisation = _kernel.LoadFunction("L1Regularisation"); - _leakyRelu = _kernel.LoadFunction("LeakyRELU"); - _leakyReluDerivative = _kernel.LoadFunction("LeakyRELUDerivative"); - _pointwiseDivideRows = _kernel.LoadFunction("PointwiseDivideRows"); - _pointwiseDivideColumns = _kernel.LoadFunction("PointwiseDivideColumns"); - _splitRows = _kernel.LoadFunction("SplitRows"); - _splitColumns = _kernel.LoadFunction("SplitColumns"); - _concatRows = _kernel.LoadFunction("ConcatRows"); - _concatColumns = _kernel.LoadFunction("ConcatColumns"); - _euclideanDistance = _kernel.LoadFunction("EuclideanDistance"); - _manhattanDistance = _kernel.LoadFunction("ManhattanDistance"); - _cosineDistance = _kernel.LoadFunction("CosineDistance"); - _abs = _kernel.LoadFunction("Abs"); - _normalise = _kernel.LoadFunction("Normalise"); - _softmaxVector = _kernel.LoadFunction("SoftmaxVector"); - _multiCosine = _kernel.LoadFunction("MultiCosineDistance"); - _log = _kernel.LoadFunction("Log"); - _exp = _kernel.LoadFunction("Exp"); - _vectorAddInPlace = _kernel.LoadFunction("VectorAddInPlace"); - _vectorCopyRandom = _kernel.LoadFunction("VectorCopyRandom"); - _copyToMatrixColumns = _kernel.LoadFunction("CopyToMatrixColumns"); - _copyToMatrixRows = _kernel.LoadFunction("CopyToMatrixRows"); - _tensorAddPadding = _kernel.LoadFunction("TensorAddPadding"); - _tensorRemovePadding = _kernel.LoadFunction("TensorRemovePadding"); - _tensorIm2Col = _kernel.LoadFunction("TensorIm2Col"); - _softmaxDerivative = _kernel.LoadFunction("SoftmaxDerivative"); - _reverse = _kernel.LoadFunction("Reverse"); - _rotateInPlace = _kernel.LoadFunction("RotateInPlace"); - _tensorMaxPool = _kernel.LoadFunction("TensorMaxPool"); - 
_tensorReverseMaxPool = _kernel.LoadFunction("TensorReverseMaxPool"); - _tensorReverseIm2Col = _kernel.LoadFunction("TensorReverseIm2Col"); - _isFinite = _kernel.LoadFunction("IsFinite"); - _calculateDistance = _kernel.LoadFunction("CalculateDistances"); - _roundInPlace = _kernel.LoadFunction("RoundInPlace"); - _scale = _kernel.LoadFunction("Scale"); + _pointwiseMultiply = _kernel.LoadFunction("PointwiseMultiply"); + _addInPlace = _kernel.LoadFunction("AddInPlace"); + _subtractInPlace = _kernel.LoadFunction("SubtractInPlace"); + _addToEachRow = _kernel.LoadFunction("AddToEachRow"); + _addToEachColumn = _kernel.LoadFunction("AddToEachColumn"); + _multiplyByEachRow = _kernel.LoadFunction("MultiplyByEachRow"); + _multiplyByEachColumn = _kernel.LoadFunction("MultiplyByEachColumn"); + _tanh = _kernel.LoadFunction("TanH"); + _tanhDerivative = _kernel.LoadFunction("TanHDerivative"); + _sigmoid = _kernel.LoadFunction("Sigmoid"); + _sigmoidDerivative = _kernel.LoadFunction("SigmoidDerivative"); + _sumRows = _kernel.LoadFunction("SumRows"); + _relu = _kernel.LoadFunction("RELU"); + _reluDerivative = _kernel.LoadFunction("RELUDerivative"); + _memSet = _kernel.LoadFunction("MemSet"); + _memCpy = _kernel.LoadFunction("MemCpy"); + _sumColumns = _kernel.LoadFunction("SumColumns"); + _pointwiseDivide = _kernel.LoadFunction("PointwiseDivide"); + _sqrt = _kernel.LoadFunction("Sqrt"); + _findMinAndMax = _kernel.LoadFunction("FindMinAndMax"); + _sumValues = _kernel.LoadFunction("SumValues"); + _findStdDev = _kernel.LoadFunction("FindStdDev"); + _constrain = _kernel.LoadFunction("Constrain"); + _pow = _kernel.LoadFunction("Pow"); + _diagonal = _kernel.LoadFunction("Diagonal"); + _l1Regularisation = _kernel.LoadFunction("L1Regularisation"); + _leakyRelu = _kernel.LoadFunction("LeakyRELU"); + _leakyReluDerivative = _kernel.LoadFunction("LeakyRELUDerivative"); + _pointwiseDivideRows = _kernel.LoadFunction("PointwiseDivideRows"); + _pointwiseDivideColumns = 
_kernel.LoadFunction("PointwiseDivideColumns"); + _splitRows = _kernel.LoadFunction("SplitRows"); + _splitColumns = _kernel.LoadFunction("SplitColumns"); + _concatRows = _kernel.LoadFunction("ConcatRows"); + _concatColumns = _kernel.LoadFunction("ConcatColumns"); + _euclideanDistance = _kernel.LoadFunction("EuclideanDistance"); + _manhattanDistance = _kernel.LoadFunction("ManhattanDistance"); + _cosineDistance = _kernel.LoadFunction("CosineDistance"); + _cosineDistances = _kernel.LoadFunction("CosineDistances"); + _abs = _kernel.LoadFunction("Abs"); + _normalise = _kernel.LoadFunction("Normalise"); + _softmaxVector = _kernel.LoadFunction("SoftmaxVector"); + _multiCosine = _kernel.LoadFunction("CosineMultiDistance"); + _log = _kernel.LoadFunction("Log"); + _exp = _kernel.LoadFunction("Exp"); + _vectorAddInPlace = _kernel.LoadFunction("VectorAddInPlace"); + _vectorCopyRandom = _kernel.LoadFunction("VectorCopyRandom"); + _copyToMatrixColumns = _kernel.LoadFunction("CopyToMatrixColumns"); + _copyToMatrixRows = _kernel.LoadFunction("CopyToMatrixRows"); + _tensorAddPadding = _kernel.LoadFunction("TensorAddPadding"); + _tensorRemovePadding = _kernel.LoadFunction("TensorRemovePadding"); + _tensorIm2Col = _kernel.LoadFunction("TensorIm2Col"); + _softmaxDerivative = _kernel.LoadFunction("SoftmaxDerivative"); + _reverse = _kernel.LoadFunction("Reverse"); + _rotateInPlace = _kernel.LoadFunction("RotateInPlace"); + _tensorMaxPool = _kernel.LoadFunction("TensorMaxPool"); + _tensorReverseMaxPool = _kernel.LoadFunction("TensorReverseMaxPool"); + _tensorReverseIm2Col = _kernel.LoadFunction("TensorReverseIm2Col"); + _isFinite = _kernel.LoadFunction("IsFinite"); + _calculateMultiDistances = _kernel.LoadFunction("CalculateMultiDistances"); + _calculateDistances = _kernel.LoadFunction("CalculateDistances"); + _roundInPlace = _kernel.LoadFunction("RoundInPlace"); + _scale = _kernel.LoadFunction("Scale"); } /// @@ -1058,9 +1062,22 @@ internal void MultiCosine(uint size, uint columns, 
uint rows, CuDevicePtr vector ); } - internal void CalculateDistances(uint size, uint columns, uint rows, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric) + internal void CosineDistances(uint size, uint numVectors, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr aa, CuDevicePtr ret, CuDevicePtr bb) { - InvokeTensor(_calculateDistance, null, size, columns, rows, + InvokeMatrix(_cosineDistances, null, size, numVectors, + vectorPtr, + compareToPtr, + aa, + ret, + bb, + numVectors, + size + ); + } + + internal void CalculateMultiDistances(uint size, uint columns, uint rows, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric) + { + InvokeTensor(_calculateMultiDistances, null, size, columns, rows, vectorPtr, compareToPtr, ret, @@ -1071,6 +1088,18 @@ internal void CalculateDistances(uint size, uint columns, uint rows, CuDevicePtr ); } + internal void CalculateDistances(uint size, uint numVectors, CuDevicePtr vectorPtr, CuDevicePtr compareToPtr, CuDevicePtr ret, DistanceMetric distanceMetric) + { + InvokeMatrix(_calculateDistances, null, size, numVectors, + vectorPtr, + compareToPtr, + ret, + numVectors, + size, + (uint)distanceMetric + ); + } + internal void CopyToMatrixRows(uint rows, uint columns, CudaDeviceVariable from, IDeviceMemoryPtr to, CuStream* stream = null) { InvokeMatrix(_copyToMatrixRows, stream, rows, columns, from.DevicePointer, to.DevicePointer, rows, columns); diff --git a/BrightData.Cuda/CudaTensorSegment.cs b/BrightData.Cuda/CudaTensorSegment.cs index f2d2ea58..6233711d 100644 --- a/BrightData.Cuda/CudaTensorSegment.cs +++ b/BrightData.Cuda/CudaTensorSegment.cs @@ -6,7 +6,7 @@ namespace BrightData.Cuda { - internal class CudaTensorSegment(IDeviceMemoryPtr data, CudaProvider provider) : INumericSegment + internal class CudaTensorSegment(IDeviceMemoryPtr data, CudaProvider provider) : INumericSegment, IHaveDeviceMemory { const string CudaSegmentType 
= "cuda"; @@ -29,6 +29,7 @@ public static bool IsCuda(IReadOnlyNumericSegment segment, [NotNullWhen(t public int Release() => DeviceMemory.Release(); public IDeviceMemoryPtr DeviceMemory { get; } = data; + IDeviceMemoryPtr IHaveDeviceMemory.Memory => DeviceMemory; public bool IsValid => DeviceMemory.IsValid; public uint Size => DeviceMemory.Size; public string SegmentType => CudaSegmentType; diff --git a/BrightData.Cuda/cuda/brightwire.cu b/BrightData.Cuda/cuda/brightwire.cu index 4f6911c3..dfb1fdba 100644 --- a/BrightData.Cuda/cuda/brightwire.cu +++ b/BrightData.Cuda/cuda/brightwire.cu @@ -864,7 +864,7 @@ extern "C" } } - __global__ void CalculateDistances( + __global__ void CalculateMultiDistances( const float** __restrict a, const float** __restrict b, float* __restrict c, @@ -883,8 +883,6 @@ extern "C" if(distanceMetric == 0) { // euclidean float diff = aVal - bVal; output = diff * diff; - }else if(distanceMetric == 1) { // cosine - output = aVal * bVal; }else if(distanceMetric == 2) { // manhattan output = abs(aVal - bVal); } @@ -895,7 +893,32 @@ extern "C" } } - __global__ void MultiCosineDistance( + __global__ void CalculateDistances( + const float* __restrict a, + const float** __restrict b, + float* __restrict c, + uint numVectors, + uint size, + uint distanceMetric + ) { + for (uint i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { + for (uint j = blockDim.y * blockIdx.y + threadIdx.y; j < numVectors; j += blockDim.y * gridDim.y) { + float aVal = a[i]; + float bVal = b[j][i]; + float output = 0; + + if(distanceMetric == 0) { // euclidean + float diff = aVal - bVal; + output = diff * diff; + }else if(distanceMetric == 2) { // manhattan + output = abs(aVal - bVal); + } + atomicAdd(c + j, output); + } + } + } + + __global__ void CosineMultiDistance( const float** __restrict a, const float** __restrict b, float* __restrict aa, @@ -919,6 +942,26 @@ extern "C" } } + __global__ void CosineDistances( + const float* __restrict a, 
+ const float** __restrict b, + float* __restrict aa, + float* __restrict ab, + float* __restrict bb, + uint numVectors, + uint size + ) { + for (uint i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += blockDim.x * gridDim.x) { + for (uint j = blockDim.y * blockIdx.y + threadIdx.y; j < numVectors; j += blockDim.y * gridDim.y) { + float aVal = a[i]; + float bVal = b[j][i]; + atomicAdd(aa + j, aVal * aVal); + atomicAdd(ab + j, aVal * bVal); + atomicAdd(bb + j, bVal * bVal); + } + } + } + __global__ void SumValues(const float* __restrict a, uint count, float* __restrict sum, uint ai) { uint tidX = threadIdx.x; diff --git a/BrightData.Cuda/cuda/brightwire.ptx b/BrightData.Cuda/cuda/brightwire.ptx index 42cce186..66aa3204 100644 --- a/BrightData.Cuda/cuda/brightwire.ptx +++ b/BrightData.Cuda/cuda/brightwire.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 
%r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 
0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 
bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 
%rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 
CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; 
+ add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 
%rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg 
.f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, 
%r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_50.ptx b/BrightData.Cuda/cuda/brightwire_50.ptx index 33090aa4..ce953a43 100644 --- a/BrightData.Cuda/cuda/brightwire_50.ptx +++ b/BrightData.Cuda/cuda/brightwire_50.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances 
-.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - 
setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, 
%rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni 
$L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, 
%f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 
%r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + 
+$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + 
@%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + 
cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: 
ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_52.ptx b/BrightData.Cuda/cuda/brightwire_52.ptx index 42cce186..66aa3204 100644 --- a/BrightData.Cuda/cuda/brightwire_52.ptx +++ b/BrightData.Cuda/cuda/brightwire_52.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - 
ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 
%r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + 
abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + 
cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 
MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; 
+ setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, 
[MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 
CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, 
%r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git 
a/BrightData.Cuda/cuda/brightwire_53.ptx b/BrightData.Cuda/cuda/brightwire_53.ptx index bb167564..08740d7c 100644 --- a/BrightData.Cuda/cuda/brightwire_53.ptx +++ b/BrightData.Cuda/cuda/brightwire_53.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, 
[CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + 
setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; 
- mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 
%r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 
%rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 
%rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ 
$L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 
%r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: 
setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_60.ptx b/BrightData.Cuda/cuda/brightwire_60.ptx index 3be9b95b..6f747af4 100644 --- a/BrightData.Cuda/cuda/brightwire_60.ptx +++ b/BrightData.Cuda/cuda/brightwire_60.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + 
.param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 
%r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, 
[%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 
%rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - 
.param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; 
+ shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: 
.reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: 
add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + 
atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - 
@%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_61.ptx b/BrightData.Cuda/cuda/brightwire_61.ptx index 2736bd0a..9e033fca 100644 --- a/BrightData.Cuda/cuda/brightwire_61.ptx +++ b/BrightData.Cuda/cuda/brightwire_61.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, 
[CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - 
shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, 
%r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 
2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 
%r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + 
ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, 
%ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 
%r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, 
%r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_62.ptx b/BrightData.Cuda/cuda/brightwire_62.ptx index 5a821223..2c544749 100644 --- a/BrightData.Cuda/cuda/brightwire_62.ptx +++ b/BrightData.Cuda/cuda/brightwire_62.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl 
CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 
%r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - 
mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - 
setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra 
$L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 
bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 
CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, 
%r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; 
+ add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, 
%r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_70.ptx b/BrightData.Cuda/cuda/brightwire_70.ptx index f2a86115..ca8ad5e5 100644 --- a/BrightData.Cuda/cuda/brightwire_70.ptx +++ b/BrightData.Cuda/cuda/brightwire_70.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 
%rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 
%r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, 
%r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra 
$L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred 
%p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + 
setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, 
[CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + 
ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 
%rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_72.ptx b/BrightData.Cuda/cuda/brightwire_72.ptx index 0e508c92..d5e55a89 100644 --- a/BrightData.Cuda/cuda/brightwire_72.ptx +++ b/BrightData.Cuda/cuda/brightwire_72.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 
CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, 
%ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: 
- mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, 
%r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; 
- add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; 
+ setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + 
+$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ 
$L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + 
@%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ 
$L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_75.ptx b/BrightData.Cuda/cuda/brightwire_75.ptx index 92b2c519..d9f08c80 100644 --- a/BrightData.Cuda/cuda/brightwire_75.ptx +++ b/BrightData.Cuda/cuda/brightwire_75.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 
%r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; 
- @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + 
atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, 
%f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 
CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, 
%r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, 
[MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, 
+ .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; 
+ setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_80.ptx b/BrightData.Cuda/cuda/brightwire_80.ptx index 2f211aad..1ff5501c 100644 --- 
a/BrightData.Cuda/cuda/brightwire_80.ptx +++ b/BrightData.Cuda/cuda/brightwire_80.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + 
cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + 
mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; 
- setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 
%rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + 
setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 
%rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, 
%r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, 
%r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 
%r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_86.ptx b/BrightData.Cuda/cuda/brightwire_86.ptx index b9d46f59..0fb9cfae 100644 --- a/BrightData.Cuda/cuda/brightwire_86.ptx +++ b/BrightData.Cuda/cuda/brightwire_86.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 
CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + 
mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 
%rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + 
mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 
MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + 
sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, 
[MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra 
$L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, 
%f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, 
%rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_87.ptx b/BrightData.Cuda/cuda/brightwire_87.ptx index 74abd754..1401facf 100644 --- a/BrightData.Cuda/cuda/brightwire_87.ptx +++ b/BrightData.Cuda/cuda/brightwire_87.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, 
[CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; 
- setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 
%rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + 
add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, 
[CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, 
%rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, 
%r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + 
mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - 
@%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_89.ptx b/BrightData.Cuda/cuda/brightwire_89.ptx index 63e2f638..8dff67ac 100644 --- a/BrightData.Cuda/cuda/brightwire_89.ptx +++ b/BrightData.Cuda/cuda/brightwire_89.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 
CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg .b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + 
mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 %rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - 
add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + 
+ mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; 
} - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, [CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + 
mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 
CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, [CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: 
+$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 %rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, 
%rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; 
ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/brightwire_90.ptx b/BrightData.Cuda/cuda/brightwire_90.ptx index b0423e2c..760cebcd 100644 --- a/BrightData.Cuda/cuda/brightwire_90.ptx +++ b/BrightData.Cuda/cuda/brightwire_90.ptx @@ -4022,200 +4022,308 @@ $L__BB56_3: ret; } - // .globl CalculateDistances -.visible .entry CalculateDistances( - .param .u64 CalculateDistances_param_0, - .param .u64 CalculateDistances_param_1, - .param .u64 CalculateDistances_param_2, - .param .u32 CalculateDistances_param_3, - .param .u32 CalculateDistances_param_4, - .param .u32 CalculateDistances_param_5, - .param .u32 CalculateDistances_param_6 + // .globl CalculateMultiDistances +.visible .entry CalculateMultiDistances( + .param .u64 CalculateMultiDistances_param_0, + .param .u64 CalculateMultiDistances_param_1, + .param .u64 CalculateMultiDistances_param_2, + .param .u32 CalculateMultiDistances_param_3, + .param .u32 CalculateMultiDistances_param_4, + .param .u32 CalculateMultiDistances_param_5, + .param .u32 CalculateMultiDistances_param_6 ) { - .reg .pred %p<15>; - .reg .f32 %f<15>; - .reg .b32 %r<53>; - .reg .b64 %rd<46>; - - - ld.param.u64 %rd5, [CalculateDistances_param_0]; - ld.param.u64 %rd6, [CalculateDistances_param_1]; - ld.param.u64 %rd7, [CalculateDistances_param_2]; - ld.param.u32 %r27, [CalculateDistances_param_3]; - ld.param.u32 %r28, [CalculateDistances_param_4]; - ld.param.u32 %r29, [CalculateDistances_param_5]; - ld.param.u32 %r30, [CalculateDistances_param_6]; - cvta.to.global.u64 %rd1, %rd7; - cvta.to.global.u64 %rd2, %rd6; - cvta.to.global.u64 %rd3, %rd5; - mov.u32 %r31, %ctaid.x; + .reg .pred %p<13>; + .reg .f32 %f<12>; + .reg 
.b32 %r<49>; + .reg .b64 %rd<38>; + + + ld.param.u64 %rd6, [CalculateMultiDistances_param_0]; + ld.param.u64 %rd7, [CalculateMultiDistances_param_1]; + ld.param.u64 %rd8, [CalculateMultiDistances_param_2]; + ld.param.u32 %r25, [CalculateMultiDistances_param_3]; + ld.param.u32 %r26, [CalculateMultiDistances_param_4]; + ld.param.u32 %r27, [CalculateMultiDistances_param_5]; + ld.param.u32 %r28, [CalculateMultiDistances_param_6]; + cvta.to.global.u64 %rd1, %rd8; + cvta.to.global.u64 %rd2, %rd7; + cvta.to.global.u64 %rd3, %rd6; + mov.u32 %r29, %ctaid.x; mov.u32 %r1, %ntid.x; - mov.u32 %r32, %tid.x; - mad.lo.s32 %r45, %r1, %r31, %r32; - setp.ge.u32 %p1, %r45, %r29; - @%p1 bra $L__BB57_21; - - mov.u32 %r33, %ntid.y; - mov.u32 %r34, %ctaid.y; - mov.u32 %r35, %tid.y; - mad.lo.s32 %r3, %r33, %r34, %r35; - mov.u32 %r36, %nctaid.x; - mul.lo.s32 %r4, %r1, %r36; - mov.u32 %r37, %ctaid.z; - mov.u32 %r38, %ntid.z; - mov.u32 %r39, %tid.z; - mad.lo.s32 %r5, %r38, %r37, %r39; - mov.u32 %r40, %nctaid.y; - mul.lo.s32 %r6, %r33, %r40; - mov.u32 %r41, %nctaid.z; - mul.lo.s32 %r7, %r38, %r41; + mov.u32 %r30, %tid.x; + mad.lo.s32 %r42, %r1, %r29, %r30; + setp.ge.u32 %p1, %r42, %r27; + @%p1 bra $L__BB57_18; + + mov.u32 %r31, %ntid.y; + mov.u32 %r32, %ctaid.y; + mov.u32 %r33, %tid.y; + mad.lo.s32 %r3, %r31, %r32, %r33; + mov.u32 %r34, %nctaid.x; + mul.lo.s32 %r4, %r1, %r34; + mov.u32 %r35, %ctaid.z; + mov.u32 %r36, %ntid.z; + mov.u32 %r37, %tid.z; + mad.lo.s32 %r5, %r36, %r35, %r37; + mov.u32 %r38, %nctaid.y; + mul.lo.s32 %r6, %r31, %r38; + mov.u32 %r39, %nctaid.z; + mul.lo.s32 %r7, %r36, %r39; $L__BB57_2: - setp.ge.u32 %p2, %r3, %r28; - @%p2 bra $L__BB57_20; + setp.ge.u32 %p2, %r3, %r26; + @%p2 bra $L__BB57_17; - setp.eq.s32 %p3, %r30, 0; - cvt.u64.u32 %rd4, %r45; - @%p3 bra $L__BB57_15; + setp.eq.s32 %p3, %r28, 0; + cvt.u64.u32 %rd4, %r42; + @%p3 bra $L__BB57_12; - mov.u32 %r46, %r3; + mov.u32 %r43, %r3; $L__BB57_5: - setp.ge.u32 %p4, %r5, %r27; - @%p4 bra $L__BB57_14; - - mul.wide.u32 
%rd8, %r46, 8; - add.s64 %rd9, %rd3, %rd8; - ld.global.nc.u64 %rd10, [%rd9]; - cvta.to.global.u64 %rd11, %rd10; - shl.b64 %rd12, %rd4, 2; - add.s64 %rd13, %rd11, %rd12; - ld.global.f32 %f1, [%rd13]; - mul.lo.s32 %r10, %r46, %r27; - setp.eq.s32 %p5, %r30, 1; - @%p5 bra $L__BB57_10; - - setp.ne.s32 %p6, %r30, 2; - @%p6 bra $L__BB57_12; + setp.ge.u32 %p4, %r5, %r25; + @%p4 bra $L__BB57_11; - add.s32 %r47, %r5, %r10; - mov.u32 %r48, %r5; + setp.eq.s32 %p5, %r28, 2; + mul.wide.u32 %rd9, %r43, 8; + add.s64 %rd10, %rd3, %rd9; + ld.global.nc.u64 %rd11, [%rd10]; + cvta.to.global.u64 %rd12, %rd11; + shl.b64 %rd13, %rd4, 2; + add.s64 %rd5, %rd12, %rd13; + mul.lo.s32 %r10, %r43, %r25; + @%p5 bra $L__BB57_9; + bra.uni $L__BB57_7; $L__BB57_9: - mul.wide.u32 %rd14, %r48, 8; - add.s64 %rd15, %rd2, %rd14; - ld.global.nc.u64 %rd16, [%rd15]; - cvta.to.global.u64 %rd17, %rd16; - add.s64 %rd19, %rd17, %rd12; - ld.global.f32 %f3, [%rd19]; - sub.ftz.f32 %f4, %f1, %f3; - abs.ftz.f32 %f5, %f4; - mul.wide.u32 %rd20, %r47, 4; - add.s64 %rd21, %rd1, %rd20; - atom.global.add.f32 %f6, [%rd21], %f5; - add.s32 %r47, %r47, %r7; - add.s32 %r48, %r48, %r7; - setp.lt.u32 %p7, %r48, %r27; - @%p7 bra $L__BB57_9; - bra.uni $L__BB57_14; + ld.global.f32 %f1, [%rd5]; + mov.u32 %r46, %r5; $L__BB57_10: - mov.u32 %r49, %r5; + mul.wide.u32 %rd16, %r46, 8; + add.s64 %rd17, %rd2, %rd16; + ld.global.nc.u64 %rd18, [%rd17]; + cvta.to.global.u64 %rd19, %rd18; + add.s64 %rd21, %rd19, %rd13; + ld.global.f32 %f4, [%rd21]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + add.s32 %r40, %r46, %r10; + mul.wide.u32 %rd22, %r40, 4; + add.s64 %rd23, %rd1, %rd22; + atom.global.add.f32 %f7, [%rd23], %f6; + add.s32 %r46, %r46, %r7; + setp.lt.u32 %p7, %r46, %r25; + @%p7 bra $L__BB57_10; + bra.uni $L__BB57_11; + +$L__BB57_7: + add.s32 %r44, %r5, %r10; + mov.u32 %r45, %r5; + +$L__BB57_8: + mul.wide.u32 %rd14, %r44, 4; + add.s64 %rd15, %rd1, %rd14; + atom.global.add.f32 %f3, [%rd15], 0f00000000; + add.s32 %r44, %r44, %r7; + 
add.s32 %r45, %r45, %r7; + setp.lt.u32 %p6, %r45, %r25; + @%p6 bra $L__BB57_8; $L__BB57_11: - mul.wide.u32 %rd22, %r49, 8; - add.s64 %rd23, %rd2, %rd22; - ld.global.nc.u64 %rd24, [%rd23]; - cvta.to.global.u64 %rd25, %rd24; - add.s64 %rd27, %rd25, %rd12; - ld.global.f32 %f7, [%rd27]; - mul.ftz.f32 %f8, %f1, %f7; - add.s32 %r42, %r49, %r10; - mul.wide.u32 %rd28, %r42, 4; - add.s64 %rd29, %rd1, %rd28; - atom.global.add.f32 %f9, [%rd29], %f8; - add.s32 %r49, %r49, %r7; - setp.lt.u32 %p8, %r49, %r27; - @%p8 bra $L__BB57_11; - bra.uni $L__BB57_14; + add.s32 %r43, %r43, %r6; + setp.lt.u32 %p8, %r43, %r26; + @%p8 bra $L__BB57_5; + bra.uni $L__BB57_17; $L__BB57_12: - mov.u32 %r50, %r5; + mov.u32 %r47, %r3; $L__BB57_13: - add.s32 %r43, %r50, %r10; - mul.wide.u32 %rd30, %r43, 4; - add.s64 %rd31, %rd1, %rd30; - atom.global.add.f32 %f10, [%rd31], 0f00000000; - add.s32 %r50, %r50, %r7; - setp.lt.u32 %p9, %r50, %r27; - @%p9 bra $L__BB57_13; - -$L__BB57_14: - add.s32 %r46, %r46, %r6; - setp.lt.u32 %p10, %r46, %r28; - @%p10 bra $L__BB57_5; - bra.uni $L__BB57_20; + setp.ge.u32 %p9, %r5, %r25; + @%p9 bra $L__BB57_16; + + mul.wide.u32 %rd24, %r47, 8; + add.s64 %rd25, %rd3, %rd24; + ld.global.nc.u64 %rd26, [%rd25]; + cvta.to.global.u64 %rd27, %rd26; + shl.b64 %rd28, %rd4, 2; + add.s64 %rd29, %rd27, %rd28; + ld.global.f32 %f2, [%rd29]; + mul.lo.s32 %r20, %r47, %r25; + mov.u32 %r48, %r5; $L__BB57_15: - mov.u32 %r51, %r3; + mul.wide.u32 %rd30, %r48, 8; + add.s64 %rd31, %rd2, %rd30; + ld.global.nc.u64 %rd32, [%rd31]; + cvta.to.global.u64 %rd33, %rd32; + add.s64 %rd35, %rd33, %rd28; + ld.global.f32 %f8, [%rd35]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + add.s32 %r41, %r48, %r20; + mul.wide.u32 %rd36, %r41, 4; + add.s64 %rd37, %rd1, %rd36; + atom.global.add.f32 %f11, [%rd37], %f10; + add.s32 %r48, %r48, %r7; + setp.lt.u32 %p10, %r48, %r25; + @%p10 bra $L__BB57_15; $L__BB57_16: - setp.ge.u32 %p11, %r5, %r27; - @%p11 bra $L__BB57_19; - - mul.wide.u32 %rd32, %r51, 8; - 
add.s64 %rd33, %rd3, %rd32; - ld.global.nc.u64 %rd34, [%rd33]; - cvta.to.global.u64 %rd35, %rd34; - shl.b64 %rd36, %rd4, 2; - add.s64 %rd37, %rd35, %rd36; - ld.global.f32 %f2, [%rd37]; - mul.lo.s32 %r22, %r51, %r27; - mov.u32 %r52, %r5; + add.s32 %r47, %r47, %r6; + setp.lt.u32 %p11, %r47, %r26; + @%p11 bra $L__BB57_13; + +$L__BB57_17: + add.s32 %r42, %r42, %r4; + setp.lt.u32 %p12, %r42, %r27; + @%p12 bra $L__BB57_2; $L__BB57_18: - mul.wide.u32 %rd38, %r52, 8; - add.s64 %rd39, %rd2, %rd38; - ld.global.nc.u64 %rd40, [%rd39]; - cvta.to.global.u64 %rd41, %rd40; - add.s64 %rd43, %rd41, %rd36; - ld.global.f32 %f11, [%rd43]; - sub.ftz.f32 %f12, %f2, %f11; - mul.ftz.f32 %f13, %f12, %f12; - add.s32 %r44, %r52, %r22; - mul.wide.u32 %rd44, %r44, 4; - add.s64 %rd45, %rd1, %rd44; - atom.global.add.f32 %f14, [%rd45], %f13; - add.s32 %r52, %r52, %r7; - setp.lt.u32 %p12, %r52, %r27; - @%p12 bra $L__BB57_18; - -$L__BB57_19: - add.s32 %r51, %r51, %r6; - setp.lt.u32 %p13, %r51, %r28; - @%p13 bra $L__BB57_16; - -$L__BB57_20: - add.s32 %r45, %r45, %r4; - setp.lt.u32 %p14, %r45, %r29; - @%p14 bra $L__BB57_2; - -$L__BB57_21: ret; } - // .globl MultiCosineDistance -.visible .entry MultiCosineDistance( - .param .u64 MultiCosineDistance_param_0, - .param .u64 MultiCosineDistance_param_1, - .param .u64 MultiCosineDistance_param_2, - .param .u64 MultiCosineDistance_param_3, - .param .u64 MultiCosineDistance_param_4, - .param .u32 MultiCosineDistance_param_5, - .param .u32 MultiCosineDistance_param_6, - .param .u32 MultiCosineDistance_param_7 + // .globl CalculateDistances +.visible .entry CalculateDistances( + .param .u64 CalculateDistances_param_0, + .param .u64 CalculateDistances_param_1, + .param .u64 CalculateDistances_param_2, + .param .u32 CalculateDistances_param_3, + .param .u32 CalculateDistances_param_4, + .param .u32 CalculateDistances_param_5 +) +{ + .reg .pred %p<11>; + .reg .f32 %f<12>; + .reg .b32 %r<31>; + .reg .b64 %rd<31>; + + + ld.param.u64 %rd7, 
[CalculateDistances_param_0]; + ld.param.u64 %rd8, [CalculateDistances_param_1]; + ld.param.u64 %rd9, [CalculateDistances_param_2]; + ld.param.u32 %r16, [CalculateDistances_param_3]; + ld.param.u32 %r17, [CalculateDistances_param_4]; + ld.param.u32 %r18, [CalculateDistances_param_5]; + cvta.to.global.u64 %rd1, %rd9; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd7; + mov.u32 %r19, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r20, %tid.x; + mad.lo.s32 %r26, %r1, %r19, %r20; + setp.ge.u32 %p1, %r26, %r17; + @%p1 bra $L__BB58_15; + + mov.u32 %r21, %ntid.y; + mov.u32 %r22, %ctaid.y; + mov.u32 %r23, %tid.y; + mad.lo.s32 %r3, %r21, %r22, %r23; + mov.u32 %r24, %nctaid.x; + mul.lo.s32 %r4, %r1, %r24; + mov.u32 %r25, %nctaid.y; + mul.lo.s32 %r5, %r21, %r25; + setp.eq.s32 %p2, %r18, 0; + @%p2 bra $L__BB58_11; + +$L__BB58_3: + setp.ge.u32 %p3, %r3, %r16; + @%p3 bra $L__BB58_9; + + setp.eq.s32 %p4, %r18, 2; + cvt.u64.u32 %rd4, %r26; + mul.wide.u32 %rd10, %r26, 4; + add.s64 %rd5, %rd3, %rd10; + @%p4 bra $L__BB58_7; + bra.uni $L__BB58_5; + +$L__BB58_7: + ld.global.nc.f32 %f1, [%rd5]; + mov.u32 %r28, %r3; + +$L__BB58_8: + mul.wide.u32 %rd13, %r28, 8; + add.s64 %rd14, %rd2, %rd13; + ld.global.nc.u64 %rd15, [%rd14]; + cvta.to.global.u64 %rd16, %rd15; + shl.b64 %rd17, %rd4, 2; + add.s64 %rd18, %rd16, %rd17; + ld.global.f32 %f4, [%rd18]; + sub.ftz.f32 %f5, %f1, %f4; + abs.ftz.f32 %f6, %f5; + mul.wide.u32 %rd19, %r28, 4; + add.s64 %rd20, %rd1, %rd19; + atom.global.add.f32 %f7, [%rd20], %f6; + add.s32 %r28, %r28, %r5; + setp.lt.u32 %p6, %r28, %r16; + @%p6 bra $L__BB58_8; + bra.uni $L__BB58_9; + +$L__BB58_5: + mov.u32 %r27, %r3; + +$L__BB58_6: + mul.wide.u32 %rd11, %r27, 4; + add.s64 %rd12, %rd1, %rd11; + atom.global.add.f32 %f3, [%rd12], 0f00000000; + add.s32 %r27, %r27, %r5; + setp.lt.u32 %p5, %r27, %r16; + @%p5 bra $L__BB58_6; + +$L__BB58_9: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p7, %r26, %r17; + @%p7 bra $L__BB58_3; + bra.uni $L__BB58_15; + +$L__BB58_11: + 
setp.ge.u32 %p8, %r3, %r16; + @%p8 bra $L__BB58_14; + + cvt.u64.u32 %rd6, %r26; + mul.wide.u32 %rd21, %r26, 4; + add.s64 %rd22, %rd3, %rd21; + ld.global.nc.f32 %f2, [%rd22]; + mov.u32 %r30, %r3; + +$L__BB58_13: + mul.wide.u32 %rd23, %r30, 8; + add.s64 %rd24, %rd2, %rd23; + ld.global.nc.u64 %rd25, [%rd24]; + cvta.to.global.u64 %rd26, %rd25; + shl.b64 %rd27, %rd6, 2; + add.s64 %rd28, %rd26, %rd27; + ld.global.f32 %f8, [%rd28]; + sub.ftz.f32 %f9, %f2, %f8; + mul.ftz.f32 %f10, %f9, %f9; + mul.wide.u32 %rd29, %r30, 4; + add.s64 %rd30, %rd1, %rd29; + atom.global.add.f32 %f11, [%rd30], %f10; + add.s32 %r30, %r30, %r5; + setp.lt.u32 %p9, %r30, %r16; + @%p9 bra $L__BB58_13; + +$L__BB58_14: + add.s32 %r26, %r26, %r4; + setp.lt.u32 %p10, %r26, %r17; + @%p10 bra $L__BB58_11; + +$L__BB58_15: + ret; + +} + // .globl CosineMultiDistance +.visible .entry CosineMultiDistance( + .param .u64 CosineMultiDistance_param_0, + .param .u64 CosineMultiDistance_param_1, + .param .u64 CosineMultiDistance_param_2, + .param .u64 CosineMultiDistance_param_3, + .param .u64 CosineMultiDistance_param_4, + .param .u32 CosineMultiDistance_param_5, + .param .u32 CosineMultiDistance_param_6, + .param .u32 CosineMultiDistance_param_7 ) { .reg .pred %p<7>; @@ -4224,20 +4332,20 @@ $L__BB57_21: .reg .b64 %rd<28>; - ld.param.u64 %rd7, [MultiCosineDistance_param_0]; - ld.param.u64 %rd8, [MultiCosineDistance_param_1]; - ld.param.u64 %rd9, [MultiCosineDistance_param_2]; - ld.param.u64 %rd10, [MultiCosineDistance_param_3]; - ld.param.u64 %rd11, [MultiCosineDistance_param_4]; - ld.param.u32 %r17, [MultiCosineDistance_param_5]; - ld.param.u32 %r18, [MultiCosineDistance_param_6]; - ld.param.u32 %r19, [MultiCosineDistance_param_7]; + ld.param.u64 %rd7, [CosineMultiDistance_param_0]; + ld.param.u64 %rd8, [CosineMultiDistance_param_1]; + ld.param.u64 %rd9, [CosineMultiDistance_param_2]; + ld.param.u64 %rd10, [CosineMultiDistance_param_3]; + ld.param.u64 %rd11, [CosineMultiDistance_param_4]; + ld.param.u32 %r17, 
[CosineMultiDistance_param_5]; + ld.param.u32 %r18, [CosineMultiDistance_param_6]; + ld.param.u32 %r19, [CosineMultiDistance_param_7]; mov.u32 %r20, %ctaid.x; mov.u32 %r1, %ntid.x; mov.u32 %r21, %tid.x; mad.lo.s32 %r31, %r1, %r20, %r21; setp.ge.u32 %p1, %r31, %r19; - @%p1 bra $L__BB58_9; + @%p1 bra $L__BB59_9; mov.u32 %r22, %ntid.y; mov.u32 %r23, %ctaid.y; @@ -4259,16 +4367,16 @@ $L__BB57_21: cvta.to.global.u64 %rd4, %rd10; cvta.to.global.u64 %rd5, %rd11; -$L__BB58_2: +$L__BB59_2: setp.ge.u32 %p2, %r3, %r18; - @%p2 bra $L__BB58_8; + @%p2 bra $L__BB59_8; cvt.u64.u32 %rd6, %r31; mov.u32 %r32, %r3; -$L__BB58_4: +$L__BB59_4: setp.ge.u32 %p3, %r5, %r17; - @%p3 bra $L__BB58_7; + @%p3 bra $L__BB59_7; mul.wide.u32 %rd12, %r32, 8; add.s64 %rd13, %rd1, %rd12; @@ -4281,7 +4389,7 @@ $L__BB58_4: mad.lo.s32 %r33, %r32, %r17, %r5; mov.u32 %r34, %r5; -$L__BB58_6: +$L__BB59_6: mul.wide.u32 %rd18, %r34, 8; add.s64 %rd19, %rd2, %rd18; ld.global.nc.u64 %rd20, [%rd19]; @@ -4300,19 +4408,105 @@ $L__BB58_6: add.s32 %r33, %r33, %r7; add.s32 %r34, %r34, %r7; setp.lt.u32 %p4, %r34, %r17; - @%p4 bra $L__BB58_6; + @%p4 bra $L__BB59_6; -$L__BB58_7: +$L__BB59_7: add.s32 %r32, %r32, %r6; setp.lt.u32 %p5, %r32, %r18; - @%p5 bra $L__BB58_4; + @%p5 bra $L__BB59_4; -$L__BB58_8: +$L__BB59_8: add.s32 %r31, %r31, %r4; setp.lt.u32 %p6, %r31, %r19; - @%p6 bra $L__BB58_2; + @%p6 bra $L__BB59_2; -$L__BB58_9: +$L__BB59_9: + ret; + +} + // .globl CosineDistances +.visible .entry CosineDistances( + .param .u64 CosineDistances_param_0, + .param .u64 CosineDistances_param_1, + .param .u64 CosineDistances_param_2, + .param .u64 CosineDistances_param_3, + .param .u64 CosineDistances_param_4, + .param .u32 CosineDistances_param_5, + .param .u32 CosineDistances_param_6 +) +{ + .reg .pred %p<5>; + .reg .f32 %f<9>; + .reg .b32 %r<21>; + .reg .b64 %rd<24>; + + + ld.param.u64 %rd7, [CosineDistances_param_0]; + ld.param.u64 %rd8, [CosineDistances_param_1]; + ld.param.u64 %rd9, [CosineDistances_param_2]; + ld.param.u64 
%rd10, [CosineDistances_param_3]; + ld.param.u64 %rd11, [CosineDistances_param_4]; + ld.param.u32 %r10, [CosineDistances_param_5]; + ld.param.u32 %r11, [CosineDistances_param_6]; + mov.u32 %r12, %ctaid.x; + mov.u32 %r1, %ntid.x; + mov.u32 %r13, %tid.x; + mad.lo.s32 %r19, %r1, %r12, %r13; + setp.ge.u32 %p1, %r19, %r11; + @%p1 bra $L__BB60_6; + + mov.u32 %r14, %ntid.y; + mov.u32 %r15, %ctaid.y; + mov.u32 %r16, %tid.y; + mad.lo.s32 %r3, %r14, %r15, %r16; + mov.u32 %r17, %nctaid.x; + mul.lo.s32 %r4, %r1, %r17; + mov.u32 %r18, %nctaid.y; + mul.lo.s32 %r5, %r14, %r18; + cvta.to.global.u64 %rd1, %rd7; + cvta.to.global.u64 %rd2, %rd8; + cvta.to.global.u64 %rd3, %rd9; + cvta.to.global.u64 %rd4, %rd10; + cvta.to.global.u64 %rd5, %rd11; + +$L__BB60_2: + setp.ge.u32 %p2, %r3, %r10; + @%p2 bra $L__BB60_5; + + cvt.u64.u32 %rd6, %r19; + mul.wide.u32 %rd12, %r19, 4; + add.s64 %rd13, %rd1, %rd12; + ld.global.nc.f32 %f1, [%rd13]; + mul.ftz.f32 %f2, %f1, %f1; + mov.u32 %r20, %r3; + +$L__BB60_4: + mul.wide.u32 %rd14, %r20, 8; + add.s64 %rd15, %rd2, %rd14; + ld.global.nc.u64 %rd16, [%rd15]; + cvta.to.global.u64 %rd17, %rd16; + shl.b64 %rd18, %rd6, 2; + add.s64 %rd19, %rd17, %rd18; + ld.global.f32 %f3, [%rd19]; + mul.wide.u32 %rd20, %r20, 4; + add.s64 %rd21, %rd3, %rd20; + atom.global.add.f32 %f4, [%rd21], %f2; + add.s64 %rd22, %rd4, %rd20; + mul.ftz.f32 %f5, %f1, %f3; + atom.global.add.f32 %f6, [%rd22], %f5; + add.s64 %rd23, %rd5, %rd20; + mul.ftz.f32 %f7, %f3, %f3; + atom.global.add.f32 %f8, [%rd23], %f7; + add.s32 %r20, %r20, %r5; + setp.lt.u32 %p3, %r20, %r10; + @%p3 bra $L__BB60_4; + +$L__BB60_5: + add.s32 %r19, %r19, %r4; + setp.lt.u32 %p4, %r19, %r11; + @%p4 bra $L__BB60_2; + +$L__BB60_6: ret; } @@ -4340,7 +4534,7 @@ $L__BB58_9: mov.u32 %r2, %tid.x; mad.lo.s32 %r3, %r22, %r1, %r2; setp.ge.u32 %p1, %r3, %r20; - @%p1 bra $L__BB59_2; + @%p1 bra $L__BB61_2; cvta.to.global.u64 %rd3, %rd1; mul.lo.s32 %r23, %r3, %r21; @@ -4352,17 +4546,17 @@ $L__BB58_9: add.s32 %r26, %r25, %r24; 
st.shared.f32 [%r26], %f8; -$L__BB59_2: +$L__BB61_2: bar.sync 0; setp.ne.s32 %p2, %r2, 0; - @%p2 bra $L__BB59_11; + @%p2 bra $L__BB61_11; shl.b32 %r4, %r1, 10; sub.s32 %r27, %r20, %r4; min.u32 %r5, %r27, 1024; setp.eq.s32 %p3, %r5, 0; mov.f32 %f25, 0f00000000; - @%p3 bra $L__BB59_10; + @%p3 bra $L__BB61_10; not.b32 %r29, %r20; add.s32 %r30, %r4, %r29; @@ -4373,7 +4567,7 @@ $L__BB59_2: setp.lt.u32 %p4, %r33, 3; mov.f32 %f25, 0f00000000; mov.u32 %r45, 0; - @%p4 bra $L__BB59_7; + @%p4 bra $L__BB61_7; add.s32 %r36, %r4, -1; sub.s32 %r37, %r36, %r20; @@ -4382,7 +4576,7 @@ $L__BB59_2: neg.s32 %r42, %r39; mov.u32 %r43, _ZZ9SumValuesE5block; -$L__BB59_6: +$L__BB61_6: ld.shared.f32 %f13, [%r43]; add.ftz.f32 %f14, %f25, %f13; ld.shared.f32 %f15, [%r43+4]; @@ -4395,32 +4589,32 @@ $L__BB59_6: add.s32 %r43, %r43, 16; add.s32 %r42, %r42, -4; setp.ne.s32 %p5, %r42, 1; - @%p5 bra $L__BB59_6; + @%p5 bra $L__BB61_6; -$L__BB59_7: +$L__BB61_7: setp.eq.s32 %p6, %r47, 0; - @%p6 bra $L__BB59_10; + @%p6 bra $L__BB61_10; shl.b32 %r40, %r45, 2; mov.u32 %r41, _ZZ9SumValuesE5block; add.s32 %r46, %r41, %r40; -$L__BB59_9: +$L__BB61_9: .pragma "nounroll"; ld.shared.f32 %f20, [%r46]; add.ftz.f32 %f25, %f25, %f20; add.s32 %r46, %r46, 4; add.s32 %r47, %r47, -1; setp.ne.s32 %p7, %r47, 0; - @%p7 bra $L__BB59_9; + @%p7 bra $L__BB61_9; -$L__BB59_10: +$L__BB61_10: cvta.to.global.u64 %rd6, %rd2; mul.wide.u32 %rd7, %r1, 4; add.s64 %rd8, %rd6, %rd7; st.global.f32 [%rd8], %f25; -$L__BB59_11: +$L__BB61_11: ret; } diff --git a/BrightData.Cuda/cuda/build_kernels.bat b/BrightData.Cuda/cuda/build_kernels.bat index 226d85f6..2a6eccc9 100644 --- a/BrightData.Cuda/cuda/build_kernels.bat +++ b/BrightData.Cuda/cuda/build_kernels.bat @@ -1,16 +1,16 @@ -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_50 -o brightwire_50.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire_52.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_53 -o brightwire_53.ptx -nvcc brightwire.cu -use_fast_math 
-ptx -m 64 -arch sm_60 -o brightwire_60.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_61 -o brightwire_61.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_62 -o brightwire_62.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_70 -o brightwire_70.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_72 -o brightwire_72.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_75 -o brightwire_75.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_80 -o brightwire_80.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_86 -o brightwire_86.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_87 -o brightwire_87.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_89 -o brightwire_89.ptx -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_90 -o brightwire_90.ptx +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_50 -o brightwire_50.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire_52.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_53 -o brightwire_53.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_60 -o brightwire_60.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_61 -o brightwire_61.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_62 -o brightwire_62.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_70 -o brightwire_70.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_72 -o brightwire_72.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_75 -o brightwire_75.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_80 -o brightwire_80.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_86 -o brightwire_86.ptx 
-allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_87 -o brightwire_87.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_89 -o brightwire_89.ptx -allow-unsupported-compiler +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_90 -o brightwire_90.ptx -allow-unsupported-compiler -nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire.ptx \ No newline at end of file +nvcc brightwire.cu -use_fast_math -ptx -m 64 -arch sm_52 -o brightwire.ptx -allow-unsupported-compiler \ No newline at end of file diff --git a/BrightData.UnitTests/CudaTests.cs b/BrightData.UnitTests/CudaTests.cs index eb4cd5ec..e087565f 100644 --- a/BrightData.UnitTests/CudaTests.cs +++ b/BrightData.UnitTests/CudaTests.cs @@ -1,4 +1,6 @@ -using BrightData.LinearAlgebra.Segments; +using System.Linq; +using BrightData.Helper; +using BrightData.LinearAlgebra.Segments; using BrightData.UnitTests.Helper; using FluentAssertions; using Xunit; @@ -10,11 +12,48 @@ public class CudaTests : CudaBase [Fact] public void CopyToWrapperWithStride() { - var empty = _cuda.CreateSegment(24, true); - var ones = _cuda.CreateSegment(8, _ => 1); + using var empty = _cuda.CreateSegment(24, true); + using var ones = _cuda.CreateSegment(8, _ => 1); var wrapper = new MutableTensorSegmentWrapper(empty, 0, 3, 8); ones.CopyTo(wrapper); empty.ToNewArray()[..6].Should().BeEquivalentTo([1, 0, 0, 1, 0, 0]); } + + void FindDistance(DistanceMetric distanceMetric) + { + using var gpuSegment = _cuda.CreateSegment(8, _ => _context.NextRandomFloat()); + var gpuSegments = 8.AsRange().Select(_ => _cuda.CreateSegment(8, _ => _context.NextRandomFloat())).ToArray(); + using var gpuDistance = _cuda.FindDistances(gpuSegment, gpuSegments, distanceMetric); + + using var cpuSegment = _cpu.CreateSegment(gpuSegment); + var cpuSegments = gpuSegments.Select(_cpu.CreateSegment).ToArray(); + using var cpuDistance = _cpu.FindDistances(cpuSegment, cpuSegments,
distanceMetric); + + foreach (var (g, c) in gpuDistance.Values.Zip(cpuDistance.Values)) + Math.AreApproximatelyEqual(g, c).Should().BeTrue(); + + foreach(var item in cpuSegments) + item.Dispose(); + foreach(var item in gpuSegments) + item.Dispose(); + } + + [Fact] + public void FindCosineDistance() + { + FindDistance(DistanceMetric.Cosine); + } + + [Fact] + public void FindEuclideanDistance() + { + FindDistance(DistanceMetric.Euclidean); + } + + [Fact] + public void FindManhattanDistance() + { + FindDistance(DistanceMetric.Manhattan); + } } } diff --git a/BrightData.UnitTests/VectorTests.cs b/BrightData.UnitTests/VectorTests.cs index f5315149..d7813939 100644 --- a/BrightData.UnitTests/VectorTests.cs +++ b/BrightData.UnitTests/VectorTests.cs @@ -77,20 +77,17 @@ public void TestVectorCreation() void TestDistances(DistanceMetric distanceMetric) { var distribution = _context.CreateNormalDistribution(0, 5); - var vectors = Enumerable.Range(0, 10).Select(_ => _cpu.CreateVector(100, _ => distribution.Sample())).ToArray(); - var compareTo = Enumerable.Range(0, 20).Select(_ => _cpu.CreateVector(100, _ => distribution.Sample())).ToArray(); + var vectors = Enumerable.Range(0, 10).Select(_ => (IReadOnlyNumericSegment)_cpu.CreateSegment(100, _ => distribution.Sample())).ToArray(); + var compareTo = Enumerable.Range(0, 20).Select(_ => (IReadOnlyNumericSegment)_cpu.CreateSegment(100, _ => distribution.Sample())).ToArray(); - var gpuVectors = vectors.Select(v => _cuda.CreateVector(v.Segment)).ToArray(); - var gpuCompareTo = compareTo.Select(v => _cuda.CreateVector(v.Segment)).ToArray(); - - var mklVectors = vectors.Select(v => _mkl.CreateVector(v.Segment)).ToArray(); - var mklCompareTo = compareTo.Select(v => _mkl.CreateVector(v.Segment)).ToArray(); + var gpuVectors = vectors.Select(_cuda.CreateSegment).ToArray(); + var gpuCompareTo = compareTo.Select(_cuda.CreateSegment).ToArray(); try { AssertSameAndThenDispose( _cpu.FindDistances(vectors, compareTo, distanceMetric), 
_cuda.FindDistances(gpuVectors, gpuCompareTo, distanceMetric), - _mkl.FindDistances(mklVectors, mklCompareTo, distanceMetric) + _mkl.FindDistances(vectors, compareTo, distanceMetric) ); } finally { @@ -98,8 +95,6 @@ void TestDistances(DistanceMetric distanceMetric) compareTo.DisposeAll(); gpuVectors.DisposeAll(); gpuCompareTo.DisposeAll(); - mklVectors.DisposeAll(); - mklCompareTo.DisposeAll(); } } diff --git a/BrightData/BrightData.xml b/BrightData/BrightData.xml index 226c3e14..87ebccac 100644 --- a/BrightData/BrightData.xml +++ b/BrightData/BrightData.xml @@ -9637,6 +9637,13 @@ + + + Creates a tensor segment from an existing segment + + + + Creates a tensor segment @@ -10515,7 +10522,7 @@ - + Finds the distance between each pair of vectors @@ -10524,6 +10531,24 @@ Distance metric Matrix with the rows corresponding to the first set and columns corresponding to the second set and each element containing the distance + + + Finds the distance between each pair of vectors + + + + + + + + + Finds the distance between a vector and list of vectors + + + + + + Binds a new thread to this provider @@ -12059,12 +12084,12 @@ Current number of neighbours - + The smallest neighbour weight - + The largest neighbour weight @@ -12079,7 +12104,7 @@ The index of the neighbour with the largest weight - + Tries to add a new neighbour - will succeed if there aren't already max neighbours with a smaller weight @@ -12114,6 +12139,17 @@ + + + Creates a vector graph from an array of nodes + + + + + + Number of nodes in the graph + + Gets the neighbours for a node, sorted by distance @@ -12128,6 +12164,13 @@ + + + Enumerates the neighbour indices and their weights in ascending order + + + + Creates diff --git a/BrightData/ExtensionMethods.TensorSegment.cs b/BrightData/ExtensionMethods.TensorSegment.cs index eb97c2f1..0fa60106 100644 --- a/BrightData/ExtensionMethods.TensorSegment.cs +++ b/BrightData/ExtensionMethods.TensorSegment.cs @@ -1,5 +1,6 @@ using BrightData.Helper; using System; 
+using System.Collections; using System.Collections.Generic; using System.IO; using System.Linq; @@ -616,5 +617,31 @@ public static ReadOnlyMemory GetMemory(this IReadOnlyNumericSegment seg : segment.ToNewArray() ; } + + class MaxComparer : IComparer + where T : unmanaged, INumber, IComparable + { + public int Compare(T x, T y) => y.CompareTo(x); + } + class MinComparer : IComparer + where T : unmanaged, INumber, IComparable + { + public int Compare(T x, T y) => x.CompareTo(y); + } + + public static PriorityQueue RankedIndices(this IReadOnlyNumericSegment segment, uint count, bool ascending = true) + where T : unmanaged, INumber + { + var ret = new PriorityQueue(ascending ? new MaxComparer() : new MinComparer()); + segment.ApplyReadOnlySpan(x => { + for (int i = 0, len = x.Length; i < len; i++) { + if (ret.Count < count) + ret.Enqueue((uint)i, x[i]); + else + ret.EnqueueDequeue((uint)i, x[i]); + } + }); + return ret; + } } } diff --git a/BrightData/LinearAlgebra/LinearAlgebraProvider.cs b/BrightData/LinearAlgebra/LinearAlgebraProvider.cs index e4590f83..4b8a7e8b 100644 --- a/BrightData/LinearAlgebra/LinearAlgebraProvider.cs +++ b/BrightData/LinearAlgebra/LinearAlgebraProvider.cs @@ -9,6 +9,7 @@ using BrightData.LinearAlgebra.ReadOnly; using BrightData.LinearAlgebra.Segments; using CommunityToolkit.HighPerformance.Buffers; +using static BrightData.DataTable.ColumnOrientedDataTable; namespace BrightData.LinearAlgebra { @@ -117,6 +118,13 @@ public virtual void PopScope() /// public virtual INumericSegment CreateSegment(params T[] data) => new MutableTensorSegment(data); + /// + /// Creates a tensor segment from an existing segment + /// + /// + /// + public virtual INumericSegment CreateSegment(IReadOnlyNumericSegment segment) => new MutableTensorSegment(segment.ToNewArray()); + /// /// Creates a tensor segment /// @@ -1219,10 +1227,10 @@ public virtual IMatrix SoftmaxDerivative(IReadOnlyNumericSegment tensor) /// Second set of vectors /// Distance metric /// Matrix 
with the rows corresponding to the first set and columns corresponding to the second set and each element containing the distance - public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric) + public virtual IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric) { var rows = (uint)compareTo.Count; - var columns = (uint)vectors.Length; + var columns = (uint)vectors.Count; var ret = CreateMatrix(rows, columns, false); var totalSize = rows * columns; @@ -1230,13 +1238,13 @@ public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList { var i = (uint)(ind % rows); var j = (uint)(ind / rows); - ret[i, j] = compareTo[(int)i].FindDistance(vectors[j], distanceMetric); + ret[i, j] = compareTo[(int)i].FindDistance(vectors[(int)j], distanceMetric); }); } else { for (uint i = 0; i < rows; i++) { for (uint j = 0; j < columns; j++) { - ret[i, j] = compareTo[(int)i].FindDistance(vectors[j], distanceMetric); + ret[i, j] = compareTo[(int)i].FindDistance(vectors[(int)j], distanceMetric); } } } @@ -1244,6 +1252,45 @@ public virtual IMatrix FindDistances(IVector[] vectors, IReadOnlyList + /// Finds the distance between each pair of vectors + /// + /// + /// + /// + /// + public virtual IMatrix FindDistances(IReadOnlyList> vectors, IReadOnlyList> compareTo, DistanceMetric distanceMetric) + { + return FindDistances( + vectors.Select(x => x.Segment).ToArray(), + compareTo.Select(x => x.Segment).ToArray(), + distanceMetric + ); + } + + /// + /// Finds the distance between a vector and list of vectors + /// + /// + /// + /// + /// + public virtual IVector FindDistances(IReadOnlyNumericSegment vector, IReadOnlyList> compareTo, DistanceMetric distanceMetric) + { + var size = (uint)compareTo.Count; + var ret = CreateVector(size, false); + if (size >= Consts.MinimumSizeForParallel) { + Parallel.For(0, ret.Size, i => { + ret[i] = vector.FindDistance(compareTo[(int)i], distanceMetric); 
+ }); + } + else { + for (uint i = 0; i < size; i++) + ret[i] = vector.FindDistance(compareTo[(int)i], distanceMetric); + } + return ret; + } + /// /// Binds a new thread to this provider /// diff --git a/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs b/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs index f8e9822f..b9f9365a 100644 --- a/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs +++ b/BrightData/LinearAlgebra/VectorIndexing/Helper/IndexedFixedSizeGraphNode.cs @@ -24,12 +24,12 @@ internal struct IndexFixedSize public uint _element0; } [InlineArray(MaxNeighbours)] - internal struct DistanceFixedSize + internal struct WeightFixedSize { public T _element0; } readonly IndexFixedSize _neighbourIndices = new(); - readonly DistanceFixedSize _neighbourWeights = new(); + readonly WeightFixedSize _neighbourWeights = new(); /// /// Current number of neighbours @@ -39,12 +39,12 @@ internal struct DistanceFixedSize /// /// The smallest neighbour weight /// - public readonly T MinDistance => NeighbourCount > 0 ? NeighbourWeights[0] : T.MaxValue; + public readonly T MinWeight => NeighbourCount > 0 ? NeighbourWeights[0] : T.MaxValue; /// /// The largest neighbour weight /// - public readonly T MaxDistance => NeighbourCount > 0 ? NeighbourWeights[NeighbourCount - 1] : T.MinValue; + public readonly T MaxWeight => NeighbourCount > 0 ? 
NeighbourWeights[NeighbourCount - 1] : T.MinValue; /// /// The index of the neighbour with the smallest weight @@ -62,60 +62,11 @@ internal struct DistanceFixedSize /// /// /// - public unsafe bool TryAddNeighbour2(uint neighbourIndex, T neighbourWeight) - { - var isFull = NeighbourCount == MaxNeighbours; - fixed (uint* indices = &_neighbourIndices._element0) - fixed (T* weights = &_neighbourWeights._element0) { - // check to see if it should be inserted - if (isFull && weights[NeighbourCount - 1] <= neighbourWeight) - return false; - - byte insertPosition = 0; - var foundInsertPosition = false; - for (byte i = 0; i < NeighbourCount; i++) { - // check that the neighbour has not already been added - if (indices[i] == neighbourIndex) - return false; - - // see if we should insert here - if (weights[i] > neighbourWeight) { - insertPosition = i; - foundInsertPosition = true; - break; - } - } - - if (!foundInsertPosition) { - // there is no room left - if (isFull) - return false; - - // insert at end - insertPosition = NeighbourCount; - } - else { - // shuffle to make room - for (var i = NeighbourCount - (isFull ? 
2 : 1); i >= insertPosition; i--) { - indices[i + 1] = indices[i]; - weights[i + 1] = weights[i]; - } - } - - // insert the item - indices[insertPosition] = neighbourIndex; - weights[insertPosition] = neighbourWeight; - if (!isFull) - ++NeighbourCount; - } - return true; - } - public bool TryAddNeighbour(uint neighbourIndex, T neighbourWeight) { var isFull = NeighbourCount == MaxNeighbours; var indices = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourIndices)), MaxNeighbours); - var weights = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), MaxNeighbours); + var weights = MemoryMarshal.CreateSpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), MaxNeighbours); // check to see if it should be inserted if (isFull && weights[NeighbourCount - 1] <= neighbourWeight) @@ -181,7 +132,7 @@ public bool TryAddNeighbour(uint neighbourIndex, T neighbourWeight) /// /// Sorted list of neighbour weights /// - public readonly ReadOnlySpan NeighbourWeights => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), NeighbourCount); + public readonly ReadOnlySpan NeighbourWeights => MemoryMarshal.CreateReadOnlySpan(ref Unsafe.As(ref Unsafe.AsRef(in _neighbourWeights)), NeighbourCount); /// /// Returns a neighbour weight diff --git a/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs b/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs index d7150987..3fcb5d3f 100644 --- a/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs +++ b/BrightData/LinearAlgebra/VectorIndexing/Helper/VectorGraph.cs @@ -1,4 +1,5 @@ using System; +using System.Collections.Generic; using System.IO; using System.Numerics; using System.Runtime.CompilerServices; @@ -11,16 +12,25 @@ namespace BrightData.LinearAlgebra.VectorIndexing.Helper /// Creates a graph of vectors with a fixed size set of neighbours /// /// - public class VectorGraph + public class VectorGraph : IHaveSize where T : 
unmanaged, IBinaryFloatingPointIeee754, IMinMaxValue { readonly IndexedFixedSizeGraphNode[] _nodes; - VectorGraph(IndexedFixedSizeGraphNode[] nodes) + /// + /// Creates a vector graph from an array of nodes + /// + /// + public VectorGraph(IndexedFixedSizeGraphNode[] nodes) { _nodes = nodes; } + /// + /// Number of nodes in the graph + /// + public uint Size => (uint)_nodes.Length; + /// /// Gets the neighbours for a node, sorted by distance /// @@ -35,6 +45,13 @@ public class VectorGraph /// public ReadOnlySpan GetNeighbourWeights(uint vectorIndex) => _nodes[vectorIndex].NeighbourWeights; + /// + /// Enumerates the neighbour indices and their weights in ascending order + /// + /// + /// + public IEnumerable<(uint NeighbourIndex, T NeighbourWeight)> GetWeightedNeighbours(uint vectorIndex) => _nodes[vectorIndex].WeightedNeighbours; + /// /// Creates /// diff --git a/BrightWire/BrightWire.xml b/BrightWire/BrightWire.xml index 6011257f..cd838d2d 100644 --- a/BrightWire/BrightWire.xml +++ b/BrightWire/BrightWire.xml @@ -2981,59 +2981,6 @@ Calculate vector based statistics - - - Calculates the distance between a list of vectors and a list of vectors to compare against - - - - - Constructor - - List of vectors to compare - Distance metric for comparison - - - - The list of vectors to compare against - - - - - Distance metric - - - - - Adds a comparison vector (will be owned and disposed by the helper class) - - Vector to compare against - Index of the comparison vector - - - - Updates the comparison vector at this index (disposes the old vector) - - Index to update - Vector to replace with - - - - Updates the entire list of comparison vectors - - List of vectors to compare against - - - - Returns the index of the closest comparison vector for each vector - - - - - Returns a vector averaged from the data vectors - - Indices of the data vectors to use in the averaged vector - K Nearest Neighbour classifier diff --git a/BrightWire/Helper/VectorDistanceHelper.cs 
b/BrightWire/Helper/VectorDistanceHelper.cs deleted file mode 100644 index 1e335999..00000000 --- a/BrightWire/Helper/VectorDistanceHelper.cs +++ /dev/null @@ -1,130 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using BrightData; -using BrightData.LinearAlgebra; - -namespace BrightWire.Helper -{ - /// - /// Calculates the distance between a list of vectors and a list of vectors to compare against - /// - public class VectorDistanceHelper : IDisposable - { - readonly LinearAlgebraProvider _lap; - readonly List> _comparison = []; - readonly IVector[] _data; - - /// - /// Constructor - /// - /// List of vectors to compare - /// Distance metric for comparison - public VectorDistanceHelper(IVector[] data, DistanceMetric distanceMetric = DistanceMetric.Euclidean) - { - _lap = data[0].Context.LinearAlgebraProvider; - Metric = distanceMetric; - _data = data; - } - - void IDisposable.Dispose() - { - _comparison.ForEach(x => x.Dispose()); - Array.ForEach(_data, x => x.Dispose()); - _comparison.Clear(); - } - - /// - /// The list of vectors to compare against - /// - public IReadOnlyList> CompareTo => _comparison; - - /// - /// Distance metric - /// - public DistanceMetric Metric { get; } - - /// - /// Adds a comparison vector (will be owned and disposed by the helper class) - /// - /// Vector to compare against - /// Index of the comparison vector - public int AddComparison(IVector comparison) - { - var ret = _comparison.Count; - _comparison.Add(comparison); - return ret; - } - - /// - /// Updates the comparison vector at this index (disposes the old vector) - /// - /// Index to update - /// Vector to replace with - public void UpdateComparisonVector(int index, IVector newVector) - { - _comparison[index].Dispose(); - _comparison[index] = newVector; - } - - /// - /// Updates the entire list of comparison vectors - /// - /// List of vectors to compare against - public void SetComparisonVectors(IEnumerable> comparisonVectors) - { - 
_comparison.ForEach(c => c.Dispose()); - _comparison.Clear(); - _comparison.AddRange(comparisonVectors); - } - - /// - /// Returns the index of the closest comparison vector for each vector - /// - public uint[] GetClosest() - { - using var distance = _lap.FindDistances(_data, _comparison, Metric); - return _data.Length.AsRange() - .Select(i => GetMinimum(distance, i).Index) - .ToArray(); - } - - /// - /// Returns a vector averaged from the data vectors - /// - /// Indices of the data vectors to use in the averaged vector - public IVector GetAverageFromData(uint[] indices) - { - using var data = _lap.CreateMatrixFromColumns(indices.Select(i => _data[i]).ToArray()); - var result = data.RowSums(); - result.MultiplyInPlace(1f / indices.Length); - return result; - } - - (uint Index, float Value) GetMinimum(IMatrix matrix, uint index) - { - var len = _comparison.Count; - - switch (len) { - case 1: - return (0, matrix[0, index]); - case 0: - throw new Exception("Cannot find minimum with zero length"); - } - - var (min, _, minIndex, _) = matrix.GetColumnSpan(index).GetMinAndMaxValues(); - return (minIndex, min); - - //var bestIndex = uint.MaxValue; - //var min = float.MaxValue; - //for (uint j = 0; j < len; j++) { - // var val = matrix[j, columnIndex]; - // if (val < min) { - // bestIndex = j; - // min = val; - // } - //} - //return (bestIndex, min); - } - } -}