From 90f2c2186fd390e55241b16a782830082da49b2d Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Fri, 2 Aug 2019 01:18:08 +0200 Subject: [PATCH] New major mathematical optimization giving boost of up to 14% on my nVidia GTX 1070. Number of multiplications during point addition phase has been reduced from 3 to 2. In addition to this a visible progress counter has been added during initialization. --- Dispatcher.cpp | 79 ++++++++-------- Dispatcher.hpp | 13 +-- profanity.cl | 251 ++++++++++++++++++++++++------------------------- 3 files changed, 172 insertions(+), 171 deletions(-) diff --git a/Dispatcher.cpp b/Dispatcher.cpp index c9226fc..884ced3 100644 --- a/Dispatcher.cpp +++ b/Dispatcher.cpp @@ -55,12 +55,12 @@ static void printResult(cl_ulong4 seed, cl_ulong round, result r, cl_uchar score std::cout << ": 0x" << strPublic << std::endl; } -unsigned int getKernelExecutionTimeMillis(cl_event & e) { +unsigned int getKernelExecutionTimeMicros(cl_event & e) { cl_ulong timeStart = 0, timeEnd = 0; clWaitForEvents(1, &e); clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_START, sizeof(timeStart), &timeStart, NULL); clGetEventProfilingInfo(e, CL_PROFILING_COMMAND_END, sizeof(timeEnd), &timeEnd, NULL); - return (timeEnd - timeStart) / 1000000; + return (timeEnd - timeStart) / 1000; } Dispatcher::OpenCLException::OpenCLException(const std::string s, const cl_int res) : @@ -127,16 +127,15 @@ Dispatcher::Device::Device(Dispatcher & parent, cl_context & clContext, cl_progr m_worksizeLocal(worksizeLocal), m_clScoreMax(0), m_clQueue(createQueue(clContext, clDeviceId) ), - m_kernelBegin( createKernel(clProgram, "profanity_begin") ), - m_kernelInverse(createKernel(clProgram, "profanity_inverse_multiple")), - m_kernelInversePost(createKernel(clProgram, "profanity_inverse_post")), - m_kernelEnd(createKernel(clProgram, "profanity_end")), + m_kernelInit( createKernel(clProgram, "profanity_init") ), + m_kernelInverse(createKernel(clProgram, "profanity_inverse")), + m_kernelIterate(createKernel(clProgram, "profanity_iterate")), m_kernelTransform( mode.transformKernel() == "" ? NULL : createKernel(clProgram, mode.transformKernel())), m_kernelScore(createKernel(clProgram, mode.kernel)), m_memPrecomp(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, sizeof(g_precomp), g_precomp), - m_memPointsX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), - m_memPointsY(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), - m_memInverse(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), + m_memPointsDeltaX(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), + m_memInversedNegativeDoubleGy(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), + m_memPrevLambda(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, size, true), m_memResult(clContext, m_clQueue, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY, PROFANITY_MAX_SCORE + 1), m_memData1(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), m_memData2(clContext, m_clQueue, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY, 20), @@ -205,6 +204,8 @@ void Dispatcher::init() { std::cout << std::endl; const auto deviceCount = m_vDevices.size(); + m_sizeInitTotal = m_size * deviceCount; + m_sizeInitDone = 0; cl_event * const pInitEvents = new cl_event[deviceCount]; @@ -238,33 +239,28 @@ void Dispatcher::initBegin(Device & d) { d.m_memData2.write(true); // Kernel arguments - profanity_begin - d.m_memPrecomp.setKernelArg(d.m_kernelBegin, 0); - d.m_memPointsX.setKernelArg(d.m_kernelBegin, 1); - d.m_memPointsY.setKernelArg(d.m_kernelBegin, 2); - d.m_memResult.setKernelArg(d.m_kernelBegin, 3); - CLMemory::setKernelArg(d.m_kernelBegin, 4, d.m_clSeed); + d.m_memPrecomp.setKernelArg(d.m_kernelInit, 0); + d.m_memPointsDeltaX.setKernelArg(d.m_kernelInit, 1); + d.m_memPrevLambda.setKernelArg(d.m_kernelInit, 2); + d.m_memResult.setKernelArg(d.m_kernelInit, 3); + CLMemory::setKernelArg(d.m_kernelInit, 4, d.m_clSeed); // Kernel arguments - profanity_inverse - d.m_memPointsX.setKernelArg(d.m_kernelInverse, 0); - d.m_memInverse.setKernelArg(d.m_kernelInverse, 1); + d.m_memPointsDeltaX.setKernelArg(d.m_kernelInverse, 0); + d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelInverse, 1); - // Kernel arguments - profanity_inverse_post - d.m_memPointsX.setKernelArg(d.m_kernelInversePost, 0); - d.m_memPointsY.setKernelArg(d.m_kernelInversePost, 1); - d.m_memInverse.setKernelArg(d.m_kernelInversePost, 2); - - // Kernel arguments - profanity_end - d.m_memPointsX.setKernelArg(d.m_kernelEnd, 0); - d.m_memPointsY.setKernelArg(d.m_kernelEnd, 1); - d.m_memInverse.setKernelArg(d.m_kernelEnd, 2); + // Kernel arguments - profanity_iterate + d.m_memPointsDeltaX.setKernelArg(d.m_kernelIterate, 0); + d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelIterate, 1); + d.m_memPrevLambda.setKernelArg(d.m_kernelIterate, 2); // Kernel arguments - profanity_transform_* if(d.m_kernelTransform) { - d.m_memInverse.setKernelArg(d.m_kernelTransform, 0); + d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelTransform, 0); } // Kernel arguments - profanity_score_* - d.m_memInverse.setKernelArg(d.m_kernelScore, 0); + d.m_memInversedNegativeDoubleGy.setKernelArg(d.m_kernelScore, 0); d.m_memResult.setKernelArg(d.m_kernelScore, 1); d.m_memData1.setKernelArg(d.m_kernelScore, 2); d.m_memData2.setKernelArg(d.m_kernelScore, 3); @@ -277,11 +273,16 @@ void Dispatcher::initBegin(Device & d) { void Dispatcher::initContinue(Device & d) { size_t sizeLeft = m_size - d.m_sizeInitialized; + const size_t sizeInitLimit = m_size / 20; + + // Print progress + const size_t percentDone = m_sizeInitDone * 100 / m_sizeInitTotal; + std::cout << " " << percentDone << "%\r" << std::flush; if (sizeLeft) { cl_event event; - const size_t sizeRun = std::min(sizeLeft, m_worksizeMax); - const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelBegin, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event); + const size_t sizeRun = std::min(sizeInitLimit, std::min(sizeLeft, m_worksizeMax)); + const auto resEnqueue = clEnqueueNDRangeKernel(d.m_clQueue, d.m_kernelInit, 1, &d.m_sizeInitialized, &sizeRun, NULL, 0, NULL, &event); OpenCLException::throwIfError("kernel queueing failed during initilization", resEnqueue); // See: https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/clSetEventCallback.html @@ -292,7 +293,9 @@ void Dispatcher::initContinue(Device & d) { // clFlush on the queue before returning or arrange for clFlush to be called later on another thread. clFlush(d.m_clQueue); + std::lock_guard lock(m_mutex); d.m_sizeInitialized += sizeRun; + m_sizeInitDone += sizeRun; const auto resCallback = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d); OpenCLException::throwIfError("failed to set custom callback during initialization", resCallback); @@ -340,16 +343,13 @@ void Dispatcher::dispatch(Device & d) { #ifdef PROFANITY_DEBUG cl_event eventInverse; - cl_event eventInversePost; - cl_event eventEnd; + cl_event eventIterate; enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize, &eventInverse); - enqueueKernelDevice(d, d.m_kernelInversePost, m_size, &eventInversePost); - enqueueKernelDevice(d, d.m_kernelEnd, m_size, &eventEnd); + enqueueKernelDevice(d, d.m_kernelIterate, m_size, &eventIterate); #else enqueueKernelDevice(d, d.m_kernelInverse, m_size / m_inverseSize); - enqueueKernelDevice(d, d.m_kernelInversePost, m_size); - enqueueKernelDevice(d, d.m_kernelEnd, m_size); + enqueueKernelDevice(d, d.m_kernelIterate, m_size); #endif if (d.m_kernelTransform) { @@ -360,8 +360,11 @@ void Dispatcher::dispatch(Device & d) { clFlush(d.m_clQueue); #ifdef PROFANITY_DEBUG - clFinish(d.m_clQueue); - std::cout << getKernelExecutionTimeMillis(eventInverse) << ", " << getKernelExecutionTimeMillis(eventInversePost) << ", " << getKernelExecutionTimeMillis(eventEnd) << std::endl; + // We're actually not allowed to call clFinish here because this function is ultimately asynchronously called by OpenCL. + // However, this happens to work on my computer and it's not really intended for release, just something to aid me in + // optimizations. + clFinish(d.m_clQueue); + std::cout << "Timing: profanity_inverse = " << getKernelExecutionTimeMicros(eventInverse) << "us, profanity_iterate = " << getKernelExecutionTimeMicros(eventIterate) << "us" << std::endl; #endif const auto res = clSetEventCallback(event, CL_COMPLETE, staticCallback, &d); @@ -399,8 +402,8 @@ void Dispatcher::onEvent(cl_event event, cl_int status, Device & d) { else if (d.m_eventFinished != NULL) { initContinue(d); } else { - handleResult(d); ++d.m_round; + handleResult(d); bool bDispatch = true; { diff --git a/Dispatcher.hpp b/Dispatcher.hpp index 9266622..ea29136 100644 --- a/Dispatcher.hpp +++ b/Dispatcher.hpp @@ -49,17 +49,16 @@ class Dispatcher { cl_uchar m_clScoreMax; cl_command_queue m_clQueue; - cl_kernel m_kernelBegin; + cl_kernel m_kernelInit; cl_kernel m_kernelInverse; - cl_kernel m_kernelInversePost; - cl_kernel m_kernelEnd; + cl_kernel m_kernelIterate; cl_kernel m_kernelTransform; cl_kernel m_kernelScore; CLMemory m_memPrecomp; - CLMemory m_memPointsX; - CLMemory m_memPointsY; - CLMemory m_memInverse; + CLMemory m_memPointsDeltaX; + CLMemory m_memInversedNegativeDoubleGy; + CLMemory m_memPrevLambda; CLMemory m_memResult; // Data parameters used in some modes @@ -125,6 +124,8 @@ class Dispatcher { std::chrono::time_point timeStart; unsigned int m_countPrint; unsigned int m_countRunning; + size_t m_sizeInitTotal; + size_t m_sizeInitDone; bool m_quit; }; diff --git a/profanity.cl b/profanity.cl index 5df812b..41a3fcf 100644 --- a/profanity.cl +++ b/profanity.cl @@ -32,45 +32,12 @@ * * Iteration * ========= - * An iteration consists of a call to: - * 1. profanity_inverse_multiple - * 2. profanity_inverse_post - * 3. profanity_end - * 4. Potential transformation kernel for contract mode - * 5. One of the scoring kernels * - * The most two important kernels are 1 and 2. After initialization the pointers - * pX and pY point to the X and Y coordinates of a number of points. In other - * words, point i is given by {pX[i], pY[i]}. These are points on the - * elliptical curve used for Ethereum address generation: secp256k1. - * - * A private key is a point on this curve and its transformed in an - * irreversible process to what we know as a public address. Since the process - * is irreversible this program has to try private key after private key until - * it finds the pattern the user is seeking. To move from one private key - * to the next - that is to say from one point on the curve to the next - we - * have to add the generator point to it. - * - * This is an elliptical point addition and it is performed in the second kernel, - * profanity_inverse_post. The point addition requires the modular inverse of - * a value, this inverse is calculated in profanity_inverse_multiple and saved - * for a point in the area pointed to by pInverse. So the inverse necessary for - * point i is saved in pInverse[i]. - * - * The transformation from a point to a public address takes place in the - * kernel profanity_end. - * - * As a result of optimization this program no longer directly stores the X and - * Y value of a point on the curve in pPoints, instead the deltas X - G_x and - * Y - G_y are saved and the actual X and Y coordinates are retrieved by - * adding back G_x and G_y to the stored values. More information on this - * optimization can be found in the extensive comments for the kernels themselves. * * TODO * ==== - * * Experiment and see if an improved version of profanity_inverse_multiple - * that runs over all the points and thus only perform A SINGLE inversion - * is feasible. Preliminary testing shows a maximum speed-up by about 4%. + * * Update comments to reflect new optimizations and structure + * */ /* ------------------------------------------------------------------------ */ @@ -94,6 +61,10 @@ __constant const mp_number tripleNegativeGx = { {0xbb17b196, 0xf2287bec, 0x76958 // doubleNegativeGy = 0x6f8a4b11b2b8773544b60807e3ddeeae05d0976eb2f557ccc7705edf09de52bf __constant const mp_number doubleNegativeGy = { {0x09de52bf, 0xc7705edf, 0xb2f557cc, 0x05d0976e, 0xe3ddeeae, 0x44b60807, 0xb2b87735, 0x6f8a4b11} }; +// negativeGy = 0xb7c52588d95c3b9aa25b0403f1eef75702e84bb7597aabe663b82f6f04ef2777 +__constant const mp_number negativeGy = { {0x04ef2777, 0x63b82f6f, 0x597aabe6, 0x02e84bb7, 0xf1eef757, 0xa25b0403, 0xd95c3b9a, 0xb7c52588 } }; + + // Multiprecision subtraction. Underflow signalled via return value. mp_word mp_sub(mp_number * const r, const mp_number * const a, const mp_number * const b) { mp_word t, c = 0; @@ -343,7 +314,6 @@ void mp_mul_mod_word_sub(mp_number * const r, const mp_word w, const bool withMo // I have no idea, for the time being I'll leave it like this, also see the comments at the // beginning of this document under the title "Cutting corners". void mp_mod_mul(mp_number * const r, const mp_number * const X, const mp_number * const Y) { - mp_number Z = { {0} }; mp_word extraWord; @@ -418,7 +388,7 @@ typedef struct { // Elliptical point addition // Does not handle points sharing X coordinate, this is a deliberate design choice. // For more information on this choice see the beginning of this file. -void point_add(point * const p, point * const o) { +void point_add(point * const r, point * const p, point * const o) { mp_number tmp; mp_number newX; mp_number newY; @@ -438,8 +408,8 @@ void point_add(point * const p, point * const o) { mp_mod_mul(&newY, &newY, &tmp); mp_mod_sub(&newY, &newY, &p->y); - p->x = newX; - p->y = newY; + r->x = newX; + r->y = newY; } /* ------------------------------------------------------------------------ */ @@ -451,7 +421,7 @@ typedef struct { uchar foundHash[20]; } result; -void profanity_begin_seed(__global const point * const precomp, point * const p, bool * const pIsFirst, const size_t precompOffset, const ulong seed) { +void profanity_init_seed(__global const point * const precomp, point * const p, bool * const pIsFirst, const size_t precompOffset, const ulong seed) { point o; for (uchar i = 0; i < 8; ++i) { @@ -465,27 +435,42 @@ void profanity_begin_seed(__global const point * const precomp, point * const p, *pIsFirst = false; } else { - point_add(p, &o); + point_add(p, p, &o); } } } } -__kernel void profanity_begin(__global const point * const precomp, __global mp_number * const pX, __global mp_number * const pY, __global result * const pResult, const ulong4 seed) { +__kernel void profanity_init(__global const point * const precomp, __global mp_number * const pDeltaX, __global mp_number * const pPrevLambda, __global result * const pResult, const ulong4 seed) { const size_t id = get_global_id(0); point p; bool bIsFirst = true; - profanity_begin_seed(precomp, &p, &bIsFirst, 8 * 255 * 0, seed.x); - profanity_begin_seed(precomp, &p, &bIsFirst, 8 * 255 * 1, seed.y); - profanity_begin_seed(precomp, &p, &bIsFirst, 8 * 255 * 2, seed.z); - profanity_begin_seed(precomp, &p, &bIsFirst, 8 * 255 * 3, seed.w + id); + mp_number tmp1, tmp2; + point tmp3; + + // Calculate G^k where k = seed.wzyx (in other words, find the point indicated by the private key represented in seed) + profanity_init_seed(precomp, &p, &bIsFirst, 8 * 255 * 0, seed.x); + profanity_init_seed(precomp, &p, &bIsFirst, 8 * 255 * 1, seed.y); + profanity_init_seed(precomp, &p, &bIsFirst, 8 * 255 * 2, seed.z); + profanity_init_seed(precomp, &p, &bIsFirst, 8 * 255 * 3, seed.w + id); + + // Calculate current lambda in this point + mp_mod_sub_gx(&tmp1, &p.x); + mp_mod_inverse(&tmp1); + + mp_mod_sub_gy(&tmp2, &p.y); + mp_mod_mul(&tmp1, &tmp1, &tmp2); + // Jump to next point (precomp[0] is the generator point G) + tmp3 = precomp[0]; + point_add(&p, &tmp3, &p); + + // pDeltaX should contain the delta (x - G_x) mp_mod_sub_gx(&p.x, &p.x); - mp_mod_sub_gy(&p.y, &p.y); - pX[id] = p.x; - pY[id] = p.y; + pDeltaX[id] = p.x; + pPrevLambda[id] = tmp1; for (uchar i = 0; i < PROFANITY_MAX_SCORE + 1; ++i) { pResult[i].found = 0; @@ -499,27 +484,40 @@ __kernel void profanity_begin(__global const point * const precomp, __global mp_ // My RX 480 is very sensitive to changes in the second loop and sometimes I have // to make seemingly non-functional changes to the code to make the compiler // generate the most optimized version. -__kernel void profanity_inverse_multiple(__global mp_number * const pX, __global mp_number * const pInverse) { +__kernel void profanity_inverse(__global const mp_number * const pDeltaX, __global mp_number * const pInverse) { const size_t id = get_global_id(0) * PROFANITY_INVERSE_SIZE; + // negativeDoubleGy = 0x6f8a4b11b2b8773544b60807e3ddeeae05d0976eb2f557ccc7705edf09de52bf + mp_number negativeDoubleGy = { {0x09de52bf, 0xc7705edf, 0xb2f557cc, 0x05d0976e, 0xe3ddeeae, 0x44b60807, 0xb2b87735, 0x6f8a4b11 } }; + mp_number copy1, copy2; mp_number buffer[PROFANITY_INVERSE_SIZE]; + mp_number buffer2[PROFANITY_INVERSE_SIZE]; - buffer[0] = pX[id]; - + // We initialize buffer and buffer2 such that: + // buffer[i] = pDeltaX[id] * pDeltaX[id + 1] * pDeltaX[id + 2] * ... * pDeltaX[id + i] + // buffer2[i] = pDeltaX[id + i] + buffer[0] = pDeltaX[id]; for (uint i = 1; i < PROFANITY_INVERSE_SIZE; ++i) { - buffer[i] = pX[id + i]; - mp_mod_mul(&buffer[i], &buffer[i], &buffer[i - 1]); + buffer2[i] = pDeltaX[id + i]; + mp_mod_mul(&buffer[i], &buffer2[i], &buffer[i - 1]); } + // Take the inverse of all x-values combined copy1 = buffer[PROFANITY_INVERSE_SIZE - 1]; mp_mod_inverse(©1); + // We multiply in -2G_y together with the inverse so that we have: + // - 2 * G_y + // ---------------------------- + // x_0 * x_1 * x_2 * x_3 * ... + mp_mod_mul(©1, ©1, &negativeDoubleGy); + + // Multiply out each individual inverse using the buffers for (uint i = PROFANITY_INVERSE_SIZE - 1; i > 0; --i) { mp_mod_mul(©2, ©1, &buffer[i - 1]); + mp_mod_mul(©1, ©1, &buffer2[i]); pInverse[id + i] = copy2; - copy2 = pX[id + i]; - mp_mod_mul(©1, ©1, ©2); } pInverse[id] = copy1; @@ -537,7 +535,7 @@ __kernel void profanity_inverse_multiple(__global mp_number * const pX, __global // profanity_end to retrieve the actual x-coordinate instead of the // delta as that's what used for calculating the public hash. // -// The optimization comes when calculating the next y-coordinate. As +// One optimization is when calculating the next y-coordinate. As // given in the wiki the next y-coordinate is given by: // y_r = λ²(x_p - x_r) - y_p // In our case the other point P is the generator point so x_p = G_x, @@ -569,94 +567,93 @@ __kernel void profanity_inverse_multiple(__global mp_number * const pX, __global // We can just precalculate the constant -G_y and we get rid of one // subtraction. Woo! // +// But we aren't done yet! Let's expand the expression for the next +// lambda, λ'. We have: +// λ' = (y' - G_y) / d' +// = (-λ * d' - G_y - G_y) / d' +// = (-λ * d' - 2*G_y) / d' +// = -λ - 2*G_y / d' +// +// So the next lambda value can be calculated from the old one. This in +// and of itself is not so interesting but the fact that the term -2 * G_y +// is a constant is! Since it's constant it'll be the same value no matter +// which point we're currently working with. This means that this factor +// can be multiplied in during the inversion, and just with one call per +// inversion instead of one call per point! This is small enough to be +// negligible and thus we've reduced our point addition from three +// multi-precision multiplications to just two! Wow. Just wow. +// +// There is additional overhead introduced by storing the previous lambda +// but it's still a net gain. To additionally decrease memory access +// overhead I never any longer store the Y coordinate. Instead I +// calculate it at the end directly from the lambda and deltaX. +// // In addition to this some algebraic re-ordering has been done to move // constants into the same argument to a new function mp_mod_sub_const // in hopes that using constant storage instead of private storage // will aid speeds. // -// Just as we don't directly save the X coordinate but instead x - G_x -// we also do the same for the Y coordinate. This doesn't lead to -// any particular optimization or speed-up but is simply for consistensy. -// -// There is a minor mathematical optimization I've figured out that would -// save us from subtracting tripleNegativeGx on every other iteration by -// saving an intermediary value, this global data access however resulted -// in a net degradation in performance. I think any more optimizations -// here will be hard. -__kernel void profanity_inverse_post(__global mp_number * const pX, __global mp_number * const pY, __global const mp_number * const pInverse) { - const size_t id = get_global_id(0); - - mp_number x = pX[id]; - mp_number y = pY[id]; - mp_number tmp = pInverse[id]; - - // λ = (y - G_Y) / (x - G_X) - // y := y * pInverse[id] - mp_mod_mul(&y, &y, &tmp); - - // λ² = λ * λ <=> tmp := y * y = λ² - mp_mod_mul(&tmp, &y, &y); - - // d' = λ² - d - 3g = (-3g) - (d - λ²) <=> x := tripleNegativeGx - (x - tmp) - mp_mod_sub(&x, &x, &tmp); - mp_mod_sub_const(&x, &tripleNegativeGx, &x); - - // y' = (-G_Y) - λ * d' <=> p.y := negativeGy - (p.y * p.x) - mp_mod_mul(&y, &y, &x); - mp_mod_sub_const(&y, &doubleNegativeGy, &y); - - pX[id] = x; - pY[id] = y; -} - -// This kernel retrieves a point and calculates its public address. The -// public address is then stored in pInverse which is used only as interim -// storage as it won't otherwise be used again this cycle. +// After the above point addition this kernel calculates the public address +// corresponding to the point and stores it in pInverse which is used only +// as interim storage as it won't otherwise be used again this cycle. // // One of the scoring kernels will run after this and fetch the address // from pInverse. -__kernel void profanity_end(__global mp_number * const pX, __global mp_number * const pY, __global mp_number * const pInverse) { +__kernel void profanity_iterate(__global mp_number * const pDeltaX, __global mp_number * const pInverse, __global mp_number * const pPrevLambda) { const size_t id = get_global_id(0); // negativeGx = 0x8641998106234453aa5f9d6a3178f4f8fd640324d231d726a60d7ea3e907e497 mp_number negativeGx = { {0xe907e497, 0xa60d7ea3, 0xd231d726, 0xfd640324, 0x3178f4f8, 0xaa5f9d6a, 0x06234453, 0x86419981 } }; - // negativeGy = 0xb7c52588d95c3b9aa25b0403f1eef75702e84bb7597aabe663b82f6f04ef2777 - mp_number negativeGy = { {0x04ef2777, 0x63b82f6f, 0x597aabe6, 0x02e84bb7, 0xf1eef757, 0xa25b0403, 0xd95c3b9a, 0xb7c52588 } }; - ethhash h = { { 0 } }; - mp_number x = pX[id]; - mp_number y = pY[id]; - - // The values in pPoints are not the actual points. As a result of - // small optimizations instead the deltas p_x - G_x and p_y - G_Y - // are stored here. To retrieve the point we need to add G_x and G_y - // to the x and y values respectively. We do this by subtracting - // their negative values since I'd rather reuse mp_mod_sub than - // implement mp_mod_add. - mp_mod_sub(&x, &x, &negativeGx); - mp_mod_sub(&y, &y, &negativeGy); - - h.d[0] = bswap32(x.d[MP_WORDS - 1]); - h.d[1] = bswap32(x.d[MP_WORDS - 2]); - h.d[2] = bswap32(x.d[MP_WORDS - 3]); - h.d[3] = bswap32(x.d[MP_WORDS - 4]); - h.d[4] = bswap32(x.d[MP_WORDS - 5]); - h.d[5] = bswap32(x.d[MP_WORDS - 6]); - h.d[6] = bswap32(x.d[MP_WORDS - 7]); - h.d[7] = bswap32(x.d[MP_WORDS - 8]); - h.d[8] = bswap32(y.d[MP_WORDS - 1]); - h.d[9] = bswap32(y.d[MP_WORDS - 2]); - h.d[10] = bswap32(y.d[MP_WORDS - 3]); - h.d[11] = bswap32(y.d[MP_WORDS - 4]); - h.d[12] = bswap32(y.d[MP_WORDS - 5]); - h.d[13] = bswap32(y.d[MP_WORDS - 6]); - h.d[14] = bswap32(y.d[MP_WORDS - 7]); - h.d[15] = bswap32(y.d[MP_WORDS - 8]); + + mp_number dX = pDeltaX[id]; + mp_number tmp = pInverse[id]; + mp_number lambda = pPrevLambda[id]; + + // λ' = - (2G_y) / d' - λ <=> lambda := pInversedNegativeDoubleGy[id] - pPrevLambda[id] + mp_mod_sub(&lambda, &tmp, &lambda); + + // λ² = λ * λ <=> tmp := lambda * lambda = λ² + mp_mod_mul(&tmp, &lambda, &lambda); + + // d' = λ² - d - 3g = (-3g) - (d - λ²) <=> x := tripleNegativeGx - (x - tmp) + mp_mod_sub(&dX, &dX, &tmp); + mp_mod_sub_const(&dX, &tripleNegativeGx, &dX); + + pDeltaX[id] = dX; + pPrevLambda[id] = lambda; + + // Calculate y from dX and lambda + // y' = (-G_Y) - λ * d' <=> p.y := negativeGy - (p.y * p.x) + mp_mod_mul(&tmp, &lambda, &dX); + mp_mod_sub_const(&tmp, &negativeGy, &tmp); + + // Restore X coordinate from delta value + mp_mod_sub(&dX, &dX, &negativeGx); + + // Initialize Keccak structure with point coordinates in big endian + h.d[0] = bswap32(dX.d[MP_WORDS - 1]); + h.d[1] = bswap32(dX.d[MP_WORDS - 2]); + h.d[2] = bswap32(dX.d[MP_WORDS - 3]); + h.d[3] = bswap32(dX.d[MP_WORDS - 4]); + h.d[4] = bswap32(dX.d[MP_WORDS - 5]); + h.d[5] = bswap32(dX.d[MP_WORDS - 6]); + h.d[6] = bswap32(dX.d[MP_WORDS - 7]); + h.d[7] = bswap32(dX.d[MP_WORDS - 8]); + h.d[8] = bswap32(tmp.d[MP_WORDS - 1]); + h.d[9] = bswap32(tmp.d[MP_WORDS - 2]); + h.d[10] = bswap32(tmp.d[MP_WORDS - 3]); + h.d[11] = bswap32(tmp.d[MP_WORDS - 4]); + h.d[12] = bswap32(tmp.d[MP_WORDS - 5]); + h.d[13] = bswap32(tmp.d[MP_WORDS - 6]); + h.d[14] = bswap32(tmp.d[MP_WORDS - 7]); + h.d[15] = bswap32(tmp.d[MP_WORDS - 8]); h.d[16] ^= 0x01; // length 64 sha3_keccakf(&h); + // Save public address hash in pInverse, only used as interim storage until next cycle pInverse[id].d[0] = h.d[3]; pInverse[id].d[1] = h.d[4]; pInverse[id].d[2] = h.d[5];