From 117678a1d246bbff068ca8f5e69f05e922dbac4c Mon Sep 17 00:00:00 2001 From: xzavier Date: Mon, 26 Aug 2024 21:52:57 +0800 Subject: [PATCH] further gpu optimization --- .gitmodules | 3 +- Makefile | 10 +- src/config/definitions.hpp | 4 +- src/goldilocks | 2 +- src/prover/prover.cpp | 39 +- src/starkpil/chelpers.cpp | 2 +- src/starkpil/chelpers_steps.hpp | 22 +- src/starkpil/chelpers_steps_avx512.hpp | 21 +- src/starkpil/chelpers_steps_gpu.cu | 1030 ---------------------- src/starkpil/chelpers_steps_gpu.hpp | 73 -- src/starkpil/chelpers_steps_pack.cu | 918 +++++++++++++++++++ src/starkpil/chelpers_steps_pack.cuh | 66 ++ src/starkpil/chelpers_steps_pack.hpp | 113 ++- src/starkpil/merkleTree/merkleTreeGL.cpp | 2 +- src/starkpil/polinomial.hpp | 35 +- src/starkpil/starks.cpp | 60 +- src/starkpil/starks.hpp | 14 +- src/utils/memory.cu | 18 +- src/utils/utils.cpp | 2 +- 19 files changed, 1180 insertions(+), 1254 deletions(-) delete mode 100644 src/starkpil/chelpers_steps_gpu.cu delete mode 100644 src/starkpil/chelpers_steps_gpu.hpp create mode 100644 src/starkpil/chelpers_steps_pack.cu create mode 100644 src/starkpil/chelpers_steps_pack.cuh diff --git a/.gitmodules b/.gitmodules index fc9bb9b0f..35c9d2ced 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,4 @@ [submodule "src/goldilocks"] path = src/goldilocks - url = https://github.com/0xPolygonHermez/goldilocks.git + url = https://github.com/okx/goldilocks.git + branch = "develop" diff --git a/Makefile b/Makefile index 663a18507..54c2d1ff1 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ #INFO := $(shell cd src/goldilocks && ./configure.sh && cd ../.. && sleep 2) -#include src/goldilocks/CudaArch.mk +include src/goldilocks/CudaArch.mk NVCC := /usr/local/cuda/bin/nvcc TARGET_ZKP := zkProver @@ -45,6 +45,8 @@ else CXXFLAGS += -O3 endif +PROVER_FORK_ID=10 + ifdef PROVER_FORK_ID CXXFLAGS += -DPROVER_FORK_ID=$(PROVER_FORK_ID) endif @@ -70,7 +72,7 @@ GRPC_CPP_PLUGIN = grpc_cpp_plugin GRPC_CPP_PLUGIN_PATH ?= `which $(GRPC_CPP_PLUGIN)` INC_DIRS := $(shell find $(SRC_DIRS) -type d) $(sort $(dir)) -INC_FLAGS := $(addprefix -I,$(INC_DIRS)) +INC_FLAGS := $(addprefix -I,$(INC_DIRS)) -I/usr/local/cuda/include SRCS_ZKP := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc \)) SRCS_ZKP_GPU := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" ! -path "./src/goldilocks/utils/deviceQuery.cu" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc -or -name *.cu \)) @@ -157,7 +159,7 @@ $(BUILD_DIR_GPU)/%.asm.o: %.asm # c++ source $(BUILD_DIR_GPU)/%.cpp.o: %.cpp $(MKDIR_P) $(dir $@) - $(CXX) -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ + $(CXX) -DENABLE_EXPERIMENTAL_CODE -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@ $(BUILD_DIR_GPU)/%.cc.o: %.cc $(MKDIR_P) $(dir $@) @@ -166,7 +168,7 @@ $(BUILD_DIR_GPU)/%.cc.o: %.cc # cuda source $(BUILD_DIR_GPU)/%.cu.o: %.cu $(MKDIR_P) $(dir $@) - $(NVCC) -D__USE_CUDA__ $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@ + $(NVCC) -DENABLE_EXPERIMENTAL_CODE -D__USE_CUDA__ $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@ main_generator: $(BUILD_DIR)/$(TARGET_MNG) diff --git a/src/config/definitions.hpp b/src/config/definitions.hpp index d0f0898d5..b611c227c 100644 --- a/src/config/definitions.hpp +++ b/src/config/definitions.hpp @@ -82,8 +82,8 @@ #define DATABASE_USE_CACHE // If defined, the Database class uses a cache #define USE_NEW_KVTREE -#define MAIN_SM_EXECUTOR_GENERATED_CODE -#define MAIN_SM_PROVER_GENERATED_CODE +//#define MAIN_SM_EXECUTOR_GENERATED_CODE +//#define MAIN_SM_PROVER_GENERATED_CODE #define LOAD_CONST_FILES false diff --git a/src/goldilocks b/src/goldilocks index 30a1de8e5..d8c4973fb 160000 --- a/src/goldilocks +++ b/src/goldilocks @@ -1 +1 @@ -Subproject commit 30a1de8e5f2f6faa6194191fed33d061a8241f01 +Subproject commit d8c4973fbd6059861f428fb53fb0a9f50a774f90 diff --git a/src/prover/prover.cpp b/src/prover/prover.cpp index 2695327ee..ceb5fb833 100644 --- a/src/prover/prover.cpp +++ b/src/prover/prover.cpp @@ -36,7 +36,6 @@ #include "commit_pols_starks.hpp" #include "chelpers_steps.hpp" #include "chelpers_steps_pack.hpp" -#include "chelpers_steps_gpu.hpp" #ifdef __AVX512__ #include "chelpers_steps_avx512.hpp" #endif @@ -54,6 +53,7 @@ #include "cuda_utils.hpp" #include "ntt_goldilocks.hpp" #include +#include "chelpers_steps_pack.cuh" int asynctask(void* (*task)(void* args), void* arg) { @@ -93,6 +93,10 @@ Prover::Prover(Goldilocks &fr, { if (config.generateProof()) { +#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) + alloc_pinned_mem(uint64_t(1<<25) * 128); + warmup_gpu(); +#endif TimerStart(PROVER_INIT); //checkSetupHash(config.zkevmVerifier); @@ -136,11 +140,6 @@ Prover::Prover(Goldilocks &fr, } zklog.info("Prover::genBatchProof() successfully allocated " + to_string(polsSize) + " bytes"); } - -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - alloc_pinned_mem(uint64_t(1<<24) * _starkInfo.mapSectionsN.section[eSection::cm1_n]); - warmup_gpu(); -#endif json finalVerkeyJson; file2json(config.finalVerkey, finalVerkeyJson); @@ -519,8 +518,11 @@ void Prover::genBatchProof(ProverRequest *pProverRequest) json recursive2Verkey; file2json(config.recursive2Verkey, recursive2Verkey); +#ifdef __USE_CUDA__ + Goldilocks::Element *publics = (Goldilocks::Element *)malloc_zkevm(starksRecursive1->starkInfo.nPublics); +#else Goldilocks::Element publics[starksRecursive1->starkInfo.nPublics]; - +#endif // oldStateRoot publics[0] = cmPols.Main.B0[0]; publics[1] = cmPols.Main.B1[0]; @@ -598,16 +600,18 @@ void Prover::genBatchProof(ProverRequest *pProverRequest) /*************************************/ #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - CHelpersStepsGPU cHelpersSteps; + CHelpersStepsPackGPU cHelpersStepsZkevm; #elif defined(__AVX512__) - CHelpersStepsAvx512 cHelpersSteps; -#elif defined(__PACK__) - CHelpersStepsPack cHelpersSteps; + CHelpersStepsAvx512 cHelpersStepsZkevm; +#elif defined(__PACK__) + CHelpersStepsPack cHelpersStepsZkevm; cHelpersSteps.nrowsPack = NROWS_PACK; #else - CHelpersSteps cHelpersSteps; + CHelpersSteps cHelpersStepsZkevm; #endif + CHelpersSteps cHelpersSteps; + TimerStart(STARK_PROOF_BATCH_PROOF); ZkevmSteps zkevmChelpersSteps; @@ -615,7 +619,7 @@ void Prover::genBatchProof(ProverRequest *pProverRequest) FRIProof fproof((1 << polBits), FIELD_EXTENSION, starkZkevm->starkInfo.starkStruct.steps.size(), starkZkevm->starkInfo.evMap.size(), starkZkevm->starkInfo.nPublics); if(USE_GENERIC_PARSER) { - starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &cHelpersSteps); + starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &cHelpersStepsZkevm); } else { starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &zkevmChelpersSteps); } @@ -727,10 +731,15 @@ void Prover::genBatchProof(ProverRequest *pProverRequest) jProofRecursive1["publics"] = publicStarkJson; json2file(jProofRecursive1, pProverRequest->filePrefix + "batch_proof.proof.json"); } + +#ifdef __USE_CUDA__ + free_zkevm(publics); +#endif TimerStopAndLog(SAVE_PROOF); } TimerStopAndLog(PROVER_BATCH_PROOF); + assert(0); } void Prover::genAggregatedProof(ProverRequest *pProverRequest) @@ -846,7 +855,7 @@ void Prover::genAggregatedProof(ProverRequest *pProverRequest) if(USE_GENERIC_PARSER) { #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - CHelpersStepsGPU cHelpersSteps; + CHelpersStepsPackGPU cHelpersSteps; #elif defined(__AVX512__) CHelpersStepsAvx512 cHelpersSteps; #elif defined(__PACK__) @@ -955,7 +964,7 @@ void Prover::genFinalProof(ProverRequest *pProverRequest) FRIProofC12 fproofRecursiveF((1 << polBitsRecursiveF), FIELD_EXTENSION, starksRecursiveF->starkInfo.starkStruct.steps.size(), starksRecursiveF->starkInfo.evMap.size(), starksRecursiveF->starkInfo.nPublics); if(USE_GENERIC_PARSER) { #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - CHelpersStepsGPU cHelpersSteps; + CHelpersStepsPackGPU cHelpersSteps; #elif defined(__AVX512__) CHelpersStepsAvx512 cHelpersSteps; #elif defined(__PACK__) diff --git a/src/starkpil/chelpers.cpp b/src/starkpil/chelpers.cpp index d3f830a85..628e2782d 100644 --- a/src/starkpil/chelpers.cpp +++ b/src/starkpil/chelpers.cpp @@ -70,4 +70,4 @@ void CHelpers::loadCHelpers(BinFileUtils::BinFile *cHelpersBin) { } cHelpersBin->endReadSection(); -}; \ No newline at end of file +}; diff --git a/src/starkpil/chelpers_steps.hpp b/src/starkpil/chelpers_steps.hpp index 4c350bc1f..0c2c23037 100644 --- a/src/starkpil/chelpers_steps.hpp +++ b/src/starkpil/chelpers_steps.hpp @@ -1,8 +1,8 @@ #ifndef CHELPERS_STEPS_HPP #define CHELPERS_STEPS_HPP #include "chelpers.hpp" - #include "steps.hpp" +#include "definitions.hpp" class CHelpersSteps { public: @@ -75,14 +75,19 @@ class CHelpersSteps { bool isTmpPol = !domainExtended && s == 4; for(uint64_t k = 0; k < nColsStages[s]; ++k) { uint64_t dim = storePol[nColsStagesAcc[s] + k]; - if(storePol[nColsStagesAcc[s] + k]) { + if(!TRANSPOSE_TMP_POLS) { __m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)]; - if(isTmpPol) { - for(uint64_t i = 0; i < dim; ++i) { - Goldilocks::store_avx(¶ms.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]); + Goldilocks::store_avx(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); + } else { + if(storePol[nColsStagesAcc[s] + k]) { + __m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)]; + if(isTmpPol) { + for(uint64_t i = 0; i < dim; ++i) { + Goldilocks::store_avx(¶ms.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]); + } + } else { + Goldilocks::store_avx(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); } - } else { - Goldilocks::store_avx(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); } } } @@ -165,6 +170,7 @@ class CHelpersSteps { uint8_t *storePol = &parserArgs.storePols[parserParams.storePolsOffset]; setBufferTInfo(starkInfo, parserParams.stage); + Goldilocks3::Element_avx challenges[params.challenges.degree()]; Goldilocks3::Element_avx challenges_ops[params.challenges.degree()]; for(uint64_t i = 0; i < params.challenges.degree(); ++i) { @@ -703,4 +709,4 @@ class CHelpersSteps { } }; -#endif \ No newline at end of file +#endif diff --git a/src/starkpil/chelpers_steps_avx512.hpp b/src/starkpil/chelpers_steps_avx512.hpp index bea51d686..9fffc7791 100644 --- a/src/starkpil/chelpers_steps_avx512.hpp +++ b/src/starkpil/chelpers_steps_avx512.hpp @@ -3,6 +3,7 @@ #include "chelpers.hpp" #include "chelpers_steps.hpp" #include "steps.hpp" +#include "definitions.hpp" class CHelpersStepsAvx512 : public CHelpersSteps { public: @@ -75,14 +76,19 @@ class CHelpersStepsAvx512 : public CHelpersSteps { bool isTmpPol = !domainExtended && s == 4; for(uint64_t k = 0; k < nColsStages[s]; ++k) { uint64_t dim = storePol[nColsStagesAcc[s] + k]; - if(storePol[nColsStagesAcc[s] + k]) { + if(!TRANSPOSE_TMP_POLS) { __m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)]; - if(isTmpPol) { - for(uint64_t i = 0; i < dim; ++i) { - Goldilocks::store_avx512(¶ms.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]); + Goldilocks::store_avx512(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); + } else { + if(storePol[nColsStagesAcc[s] + k]) { + __m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)]; + if(isTmpPol) { + for(uint64_t i = 0; i < dim; ++i) { + Goldilocks::store_avx512(¶ms.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]); + } + } else { + Goldilocks::store_avx512(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); } - } else { - Goldilocks::store_avx512(¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]); } } } @@ -701,6 +707,7 @@ class CHelpersStepsAvx512 : public CHelpersSteps { assert(i_args == parserParams.nArgs); } } + }; -#endif \ No newline at end of file +#endif diff --git a/src/starkpil/chelpers_steps_gpu.cu b/src/starkpil/chelpers_steps_gpu.cu deleted file mode 100644 index 8e6bdfab2..000000000 --- a/src/starkpil/chelpers_steps_gpu.cu +++ /dev/null @@ -1,1030 +0,0 @@ -#include "chelpers_steps_gpu.hpp" -#include "chelpers_steps_pack.hpp" -#include "zklog.hpp" - -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - -#include "gl64_t.cuh" -#include "goldilocks_cubic_extension.cuh" -#include -#include "cuda_utils.cuh" -#include "cuda_utils.hpp" - -void CHelpersStepsGPU::dataSetup(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams) -{ - bool domainExtended = parserParams.stage <= 3 ? false : true; - uint64_t domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; - uint64_t nextStride = domainExtended ? 1 << (starkInfo.starkStruct.nBitsExt - starkInfo.starkStruct.nBits) : 1; - - /* - Metadata - */ - nColsStagesAcc.resize(10 + 2); - nColsStages.resize(10 + 2); - offsetsStages.resize(10 + 2); - - nColsStages[0] = starkInfo.nConstants + 2; - offsetsStages[0] = 0; - - for (uint64_t s = 1; s <= 3; ++s) - { - nColsStages[s] = starkInfo.mapSectionsN.section[string2section("cm" + to_string(s) + "_n")]; - if (domainExtended) - { - offsetsStages[s] = starkInfo.mapOffsets.section[string2section("cm" + to_string(s) + "_2ns")]; - } - else - { - offsetsStages[s] = starkInfo.mapOffsets.section[string2section("cm" + to_string(s) + "_n")]; - } - } - if (domainExtended) - { - nColsStages[4] = starkInfo.mapSectionsN.section[eSection::cm4_2ns]; - offsetsStages[4] = starkInfo.mapOffsets.section[eSection::cm4_2ns]; - } - else - { - nColsStages[4] = starkInfo.mapSectionsN.section[eSection::tmpExp_n]; - offsetsStages[4] = starkInfo.mapOffsets.section[eSection::tmpExp_n]; - } - for (uint64_t o = 0; o < 2; ++o) - { - for (uint64_t s = 0; s < 5; ++s) - { - if (s == 0) - { - if (o == 0) - { - nColsStagesAcc[0] = 0; - } - else - { - nColsStagesAcc[5 * o] = nColsStagesAcc[5 * o - 1] + nColsStages[4]; - } - } - else - { - nColsStagesAcc[5 * o + s] = nColsStagesAcc[5 * o + (s - 1)] + nColsStages[(s - 1)]; - } - } - } - nColsStagesAcc[10] = nColsStagesAcc[9] + nColsStages[4]; // Polinomials f & q - if (parserParams.stage == 4) - { - offsetsStages[10] = starkInfo.mapOffsets.section[eSection::q_2ns]; - nColsStages[10] = starkInfo.qDim; - } - else if (parserParams.stage == 5) - { - offsetsStages[10] = starkInfo.mapOffsets.section[eSection::f_2ns]; - nColsStages[10] = 3; - } - nColsStagesAcc[11] = nColsStagesAcc[10] + nColsStages[10]; // xDivXSubXi - nCols = nColsStagesAcc[11] + 6; // 3 for xDivXSubXi and 3 for xDivXSubWxi - - stepPointers_h.domainSize = domainSize; - stepPointers_h.nConstants = starkInfo.nConstants; - stepPointers_h.nextStride = nextStride; - - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.nColsStages_d), nColsStages.size() * sizeof(uint64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.nColsStages_d, nColsStages.data(), nColsStages.size() * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.nColsStagesAcc_d), nColsStagesAcc.size() * sizeof(uint64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.nColsStagesAcc_d, nColsStagesAcc.data(), nColsStagesAcc.size() * sizeof(uint64_t), cudaMemcpyHostToDevice)); - - /* - non-buffered data - */ - uint8_t *ops = &parserArgs.ops[parserParams.opsOffset]; - uint32_t *ops_aux = new uint32_t[parserParams.nOps]; - for (uint64_t i = 0; i < parserParams.nOps; ++i) - ops_aux[i] = uint32_t(ops[i]); - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.ops_d), parserParams.nOps * sizeof(uint32_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.ops_d, ops_aux, parserParams.nOps * sizeof(uint32_t), cudaMemcpyHostToDevice)); - delete[] ops_aux; - - uint16_t *args = &parserArgs.args[parserParams.argsOffset]; - uint32_t *args_aux = new uint32_t[parserParams.nArgs]; - for (uint64_t i = 0; i < parserParams.nArgs; ++i) - args_aux[i] = uint32_t(args[i]); - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.args_d), parserParams.nArgs * sizeof(uint32_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.args_d, args_aux, parserParams.nArgs * sizeof(uint32_t), cudaMemcpyHostToDevice)); - delete[] args_aux; - - uint64_t *numbers = &parserArgs.numbers[parserParams.numbersOffset]; - Goldilocks::Element *numbers_aux = new Goldilocks::Element[parserParams.nNumbers * nrowsPack]; - //this expansion could be done in the GPU... - for (uint64_t i = 0; i < parserParams.nNumbers; ++i) - { - for (uint64_t j = 0; j < nrowsPack; ++j) - { - numbers_aux[i * nrowsPack + j] = Goldilocks::fromU64(numbers[i]); - } - } - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.numbers_d), parserParams.nNumbers * nrowsPack * sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.numbers_d, numbers_aux, parserParams.nNumbers * nrowsPack * sizeof(gl64_t), cudaMemcpyHostToDevice)); - delete[] numbers_aux; - - Goldilocks::Element *challenges_aux = new Goldilocks::Element[params.challenges.degree() * FIELD_EXTENSION * nrowsPack]; - Goldilocks::Element *challenges_ops_aux = new Goldilocks::Element[params.challenges.degree() * FIELD_EXTENSION * nrowsPack]; - //this expansion could be done in the GPU... - for (uint64_t i = 0; i < params.challenges.degree(); ++i) - { - for (uint64_t j = 0; j < nrowsPack; ++j) - { - challenges_aux[(i * FIELD_EXTENSION) * nrowsPack + j] = params.challenges[i][0]; - challenges_aux[(i * FIELD_EXTENSION + 1) * nrowsPack + j] = params.challenges[i][1]; - challenges_aux[(i * FIELD_EXTENSION + 2) * nrowsPack + j] = params.challenges[i][2]; - challenges_ops_aux[(i * FIELD_EXTENSION) * nrowsPack + j] = params.challenges[i][0] + params.challenges[i][1]; - challenges_ops_aux[(i * FIELD_EXTENSION + 1) * nrowsPack + j] = params.challenges[i][0] + params.challenges[i][2]; - challenges_ops_aux[(i * FIELD_EXTENSION + 2) * nrowsPack + j] = params.challenges[i][1] + params.challenges[i][2]; - } - } - - - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.challenges_d), params.challenges.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.challenges_d, challenges_aux, params.challenges.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t), cudaMemcpyHostToDevice)); - - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.challenges_ops_d), params.challenges.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.challenges_ops_d, challenges_ops_aux, params.challenges.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t), cudaMemcpyHostToDevice)); - - delete[] challenges_aux; - delete[] challenges_ops_aux; - - Goldilocks::Element *publics_aux = new Goldilocks::Element[starkInfo.nPublics * nrowsPack]; - for (uint64_t i = 0; i < starkInfo.nPublics; ++i) - { - for (uint64_t j = 0; j < nrowsPack; ++j) - { - publics_aux[i * nrowsPack + j] = params.publicInputs[i]; - } - } - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.publics_d), starkInfo.nPublics * nrowsPack * sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.publics_d, publics_aux, starkInfo.nPublics * nrowsPack * sizeof(gl64_t), cudaMemcpyHostToDevice)); - delete[] publics_aux; - - Goldilocks::Element *evals_aux = new Goldilocks::Element[params.evals.degree() * FIELD_EXTENSION * nrowsPack]; - for (uint64_t i = 0; i < params.evals.degree(); ++i) - { - for (uint64_t j = 0; j < nrowsPack; ++j) - { - evals_aux[(i * FIELD_EXTENSION) * nrowsPack + j] = params.evals[i][0]; - evals_aux[(i * FIELD_EXTENSION + 1) * nrowsPack + j] = params.evals[i][1]; - evals_aux[(i * FIELD_EXTENSION + 2) * nrowsPack + j] = params.evals[i][2]; - } - } - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.evals_d), params.evals.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.evals_d, evals_aux, params.evals.degree() * FIELD_EXTENSION * nrowsPack * sizeof(gl64_t), cudaMemcpyHostToDevice)); - delete[] evals_aux; - - CHECKCUDAERR(cudaMalloc((void**)&(stepPointers_h.x_n_d), params.x_n.degree()*sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.x_n_d, params.x_n.address(), params.x_n.degree()*sizeof(gl64_t), cudaMemcpyHostToDevice)); - - CHECKCUDAERR(cudaMalloc((void**)&(stepPointers_h.x_2ns_d), params.x_2ns.degree()*sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.x_2ns_d, params.x_2ns.address(), params.x_2ns.degree()*sizeof(gl64_t), cudaMemcpyHostToDevice)); - - CHECKCUDAERR(cudaMalloc((void**)&(stepPointers_h.zi_d), params.zi.degree()*sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.zi_d, params.zi.address(), params.zi.degree()*sizeof(gl64_t), cudaMemcpyHostToDevice)); - - CHECKCUDAERR(cudaMalloc((void**)&(stepPointers_h.xDivXSubXi_d), params.xDivXSubXi.degree()*params.xDivXSubXi.dim()*sizeof(gl64_t))); - CHECKCUDAERR(cudaMemcpy(stepPointers_h.xDivXSubXi_d, params.xDivXSubXi.address(), params.xDivXSubXi.degree()*params.xDivXSubXi.dim()*sizeof(gl64_t), cudaMemcpyHostToDevice)); - - /* - temporal buffers - */ - - stepPointers_h.dimBufferT = 2 * nCols * nrowsPack; - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.bufferT_d), stepPointers_h.dimBufferT * nstreams * sizeof(gl64_t))); - - stepPointers_h.dimBufferPols = 0; - uint64_t nStages = 3; - for (uint64_t s = 1; s <= nStages; ++s){ - stepPointers_h.dimBufferPols += nColsStages[s]; - } - if(parserParams.stage==5){ - stepPointers_h.dimBufferPols += nColsStages[nStages + 1]; - } - stepPointers_h.dimBufferPols += nColsStages[10]; //for the store - stepPointers_h.dimBufferPols = stepPointers_h.dimBufferPols * (nrowsPack+nextStride); - CHECKCUDAERR(cudaMallocHost((void **)&(stepPointers_h.bufferPols_d), stepPointers_h.dimBufferPols * nstreams * sizeof(gl64_t))); - - stepPointers_h.dimBufferConsts = starkInfo.nConstants * (nrowsPack+nextStride); - CHECKCUDAERR(cudaMallocHost((void **)&(stepPointers_h.bufferConsts_d), stepPointers_h.dimBufferConsts * nstreams * sizeof(gl64_t))); - - stepPointers_h.dimTmp1 = parserParams.nTemp1 * nrowsPack; - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.tmp1_d), stepPointers_h.dimTmp1 * nstreams * sizeof(gl64_t))); - - stepPointers_h.dimTmp3 = parserParams.nTemp3 * nrowsPack * FIELD_EXTENSION; - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_h.tmp3_d), stepPointers_h.dimTmp3 * nstreams * sizeof(gl64_t))); - - /* - copy to device - */ - CHECKCUDAERR(cudaMalloc((void **)&(stepPointers_d), sizeof(StepsPointers))); - CHECKCUDAERR(cudaMemcpy(stepPointers_d, &stepPointers_h, sizeof(StepsPointers), cudaMemcpyHostToDevice)); -} - -void CHelpersStepsGPU::loadData(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams, uint64_t row, cudaStream_t& stream){ - - bool domainExtended = parserParams.stage > 3 ? true : false; - uint32_t iStream = (row / nrowsPack) % nstreams; - gl64_t *bufferConsts_d = &stepPointers_h.bufferConsts_d[stepPointers_h.dimBufferConsts * iStream]; - gl64_t *bufferPols_d = &stepPointers_h.bufferPols_d[stepPointers_h.dimBufferPols * iStream]; - ConstantPolsStarks *constPols = domainExtended ? params.pConstPols2ns : params.pConstPols; - - - CHECKCUDAERR(cudaMemcpyAsync(bufferConsts_d, &(((Goldilocks::Element *)constPols->address())[row * starkInfo.nConstants]), stepPointers_h.dimBufferConsts*sizeof(Goldilocks::Element), cudaMemcpyHostToDevice, stream)); - - uint64_t nStages=3; // do I relly need to copy all - uint64_t offset_pols_d=0; - for (uint64_t s = 1; s <= nStages; ++s) - { - - uint64_t offset_pols_h = offsetsStages[s] + row * nColsStages[s]; - uint64_t size_copy = nColsStages[s]*(nrowsPack+stepPointers_h.nextStride); - CHECKCUDAERR(cudaMemcpyAsync(&(bufferPols_d[offset_pols_d]), &(params.pols[offset_pols_h]), size_copy*sizeof(Goldilocks::Element), cudaMemcpyHostToDevice, stream)); - offset_pols_d += size_copy; - } - if (parserParams.stage == 5){ - - uint64_t offset_pols_h = offsetsStages[nStages + 1] + row * nColsStages[nStages + 1]; - uint64_t size_copy = nColsStages[nStages + 1]*(nrowsPack+stepPointers_h.nextStride); - CHECKCUDAERR(cudaMemcpyAsync(&(bufferPols_d[offset_pols_d]), &(params.pols[offset_pols_h]), size_copy*sizeof(Goldilocks::Element), cudaMemcpyHostToDevice, stream)); - } -} - -void CHelpersStepsGPU::storeData(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams, uint64_t row, cudaStream_t& stream){ - - uint32_t iStream = (row / nrowsPack) % nstreams; - bool domainExtended = parserParams.stage > 3 ? true : false; - gl64_t *bufferPols_d = &stepPointers_h.bufferPols_d[stepPointers_h.dimBufferPols * iStream]; - - if (!domainExtended){ - uint64_t nStages=3; // do I relly need to copy all - uint64_t offset_pols_d=0; - for (uint64_t s = 2; s <= nStages + 1; ++s) //optimize copies that can be avoided... - { - - uint64_t offset_pols_h = offsetsStages[s] + row * nColsStages[s]; - uint64_t size_copy = nColsStages[s]*nrowsPack; - - CHECKCUDAERR(cudaMemcpyAsync(&(params.pols[offset_pols_h]), &(bufferPols_d[offset_pols_d]), size_copy*sizeof(Goldilocks::Element), cudaMemcpyDeviceToHost, stream)); - offset_pols_d += size_copy; - } - }else{ - uint64_t size_copy = nColsStages[10]*nrowsPack; - gl64_t *bufferPols_ = &(stepPointers_h.bufferPols_d[(iStream+1) * stepPointers_h.dimBufferPols-size_copy]); //data available at the end - CHECKCUDAERR(cudaMemcpyAsync(&(params.pols[offsetsStages[10] + row * nColsStages[10]]), bufferPols_, size_copy*sizeof(Goldilocks::Element), cudaMemcpyDeviceToHost, stream)); - } -} - -void CHelpersStepsGPU::calculateExpressions(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams) -{ - - nrowsPack = 64; - nstreams = 16; - int nDevices; - CHECKCUDAERR(cudaGetDeviceCount(&nDevices)); - - - bool domainExtended = parserParams.stage > 3 ? true : false; - uint64_t domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; - uint64_t nextStride = domainExtended ? 1 << (starkInfo.starkStruct.nBitsExt - starkInfo.starkStruct.nBits) : 1; - - // checka - if(domainSize % nrowsPack != 0){ - zklog.error("nrowsPack should divide domainSize"); - exitProcess(); - } - if(nrowsPack <= nextStride){ - zklog.error("nrowsPack should be greater than nextStride"); - exitProcess(); - } - - // The last pack of rows are solved uwing the chelpers_pack - - CHelpersStepsPack chelpersPack; - chelpersPack.calculateExpressionsRows(starkInfo, params, parserArgs, parserParams, domainSize-nrowsPack, domainSize); - - //Rest of packs are copmuted in the GPU... - dataSetup(starkInfo, params, parserArgs, parserParams); - cudaStream_t *streams = new cudaStream_t[nstreams]; - for (int i = 0; i < nstreams; i++) - { - cudaStreamCreate(&streams[i]); - } - - for (uint64_t i = 0; i < domainSize-nrowsPack; i += nrowsPack) - { - uint32_t iStream = (i / nrowsPack) % nstreams; - loadData(starkInfo, params, parserArgs, parserParams, i, streams[iStream]); - _transposeToBuffer<<<1, nrowsPack, 0, streams[iStream]>>>(stepPointers_d, i, parserParams.stage, domainExtended, iStream); - _packComputation<<<1, nrowsPack, 0, streams[iStream]>>>(stepPointers_d, domainSize, parserParams.nOps, parserParams.nArgs, iStream); - _transposeFromBuffer<<<1, nrowsPack, 0, streams[iStream]>>>(stepPointers_d, i, parserParams.stage, domainExtended, iStream); - storeData(starkInfo, params, parserArgs, parserParams, i, streams[iStream]); - } - - // - // Synchronize and Destroy Streams and free memory - // - for (int i = 0; i < nstreams; i++) - { - cudaStreamSynchronize(streams[i]); - cudaStreamDestroy(streams[i]); - } - freePointers(); - - delete[] streams; -} - -void CHelpersStepsGPU::freePointers() -{ - cudaFree(stepPointers_h.nColsStages_d); - cudaFree(stepPointers_h.nColsStagesAcc_d); - cudaFree(stepPointers_h.ops_d); - cudaFree(stepPointers_h.args_d); - cudaFree(stepPointers_h.numbers_d); - cudaFree(stepPointers_h.challenges_d); - cudaFree(stepPointers_h.challenges_ops_d); - cudaFree(stepPointers_h.publics_d); - cudaFree(stepPointers_h.evals_d); - cudaFree(stepPointers_h.x_n_d); - cudaFree(stepPointers_h.x_2ns_d); - cudaFree(stepPointers_h.zi_d); - cudaFree(stepPointers_h.xDivXSubXi_d); - cudaFree(stepPointers_h.bufferT_d); - cudaFreeHost(stepPointers_h.bufferPols_d); - cudaFreeHost(stepPointers_h.bufferConsts_d); - cudaFree(stepPointers_h.tmp1_d); - cudaFree(stepPointers_h.tmp3_d); - cudaFree(stepPointers_d); -} - -__global__ void _packComputation(StepsPointers *stepPointers_d, uint32_t N, uint32_t nOps, uint32_t nArgs, uint32_t stream) -{ - - uint64_t i_args = 0; - gl64_t *bufferT_ = &(stepPointers_d->bufferT_d[stream * stepPointers_d->dimBufferT]); - gl64_t *tmp1 = &(stepPointers_d->tmp1_d[stream * stepPointers_d->dimTmp1]); - gl64_t *tmp3 = &(stepPointers_d->tmp3_d[stream * stepPointers_d->dimTmp3]); - uint32_t *ops_ = stepPointers_d->ops_d; - uint32_t *args_ = stepPointers_d->args_d; - - for (uint64_t kk = 0; kk < nOps; ++kk) - { - switch (ops_[kk]) - { - case 0: - { - // COPY commit1 to commit1 - gl64_t::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x]); - i_args += 4; - break; - } - case 1: - { - // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: commit1 - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 5]] + args_[i_args + 6]) * blockDim.x]); - i_args += 7; - break; - } - case 2: - { - // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: tmp1 - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &tmp1[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 3: - { - // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: public - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 4: - { - // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: number - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 5: - { - // COPY tmp1 to commit1 - gl64_t::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &tmp1[args_[i_args + 2] * blockDim.x]); - i_args += 3; - break; - } - case 6: - { - // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: tmp1 - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp1[args_[i_args + 3] * blockDim.x], &tmp1[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 7: - { - // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: public - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp1[args_[i_args + 3] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 8: - { - // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: number - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp1[args_[i_args + 3] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 9: - { - // COPY public to commit1 - gl64_t::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 2] * blockDim.x]); - i_args += 3; - break; - } - case 10: - { - // OPERATION WITH DEST: commit1 - SRC0: public - SRC1: public - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 11: - { - // OPERATION WITH DEST: commit1 - SRC0: public - SRC1: number - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 12: - { - // COPY number to commit1 - gl64_t::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 2] * blockDim.x]); - i_args += 3; - break; - } - case 13: - { - // OPERATION WITH DEST: commit1 - SRC0: number - SRC1: number - gl64_t::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 14: - { - // COPY commit1 to tmp1 - gl64_t::copy_gpu(&tmp1[args_[i_args] * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x]); - i_args += 3; - break; - } - case 15: - { - // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: commit1 - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 4]] + args_[i_args + 5]) * blockDim.x]); - i_args += 6; - break; - } - case 16: - { - // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: tmp1 - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &tmp1[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 17: - { - // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: public - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 18: - { - // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: number - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 19: - { - // COPY tmp1 to tmp1 - gl64_t::copy_gpu(&tmp1[args_[i_args] * blockDim.x], &tmp1[args_[i_args + 1] * blockDim.x]); - i_args += 2; - break; - } - case 20: - { - // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: tmp1 - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &tmp1[args_[i_args + 2] * blockDim.x], &tmp1[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 21: - { - // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: public - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &tmp1[args_[i_args + 2] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 22: - { - // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: number - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &tmp1[args_[i_args + 2] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 23: - { - // COPY public to tmp1 - gl64_t::copy_gpu(&tmp1[args_[i_args] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 1] * blockDim.x]); - i_args += 2; - break; - } - case 24: - { - // OPERATION WITH DEST: tmp1 - SRC0: public - SRC1: public - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 2] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 25: - { - // OPERATION WITH DEST: tmp1 - SRC0: public - SRC1: number - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 2] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 26: - { - // COPY number to tmp1 - gl64_t::copy_gpu(&tmp1[args_[i_args] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 1] * blockDim.x]); - i_args += 2; - break; - } - case 27: - { - // OPERATION WITH DEST: tmp1 - SRC0: number - SRC1: number - gl64_t::op_gpu(args_[i_args], &tmp1[args_[i_args + 1] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 2] * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 28: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 5]] + args_[i_args + 6]) * blockDim.x]); - i_args += 7; - break; - } - case 29: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &tmp1[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 30: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 31: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 5] * blockDim.x]); - i_args += 6; - break; - } - case 32: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 4]] + args_[i_args + 5]) * blockDim.x]); - i_args += 6; - break; - } - case 33: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &tmp1[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 34: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 35: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 36: - { - // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 4]] + args_[i_args + 5]) * blockDim.x]); - i_args += 6; - break; - } - case 37: - { - // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &tmp1[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 38: - { - // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 39: - { - // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 40: - { - // COPY commit3 to commit3 - Goldilocks3GPU::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x]); - i_args += 4; - break; - } - case 41: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: commit3 - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 5]] + args_[i_args + 6]) * blockDim.x]); - i_args += 7; - break; - } - case 42: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: tmp3 - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &tmp3[args_[i_args + 5] * blockDim.x * FIELD_EXTENSION]); - i_args += 6; - break; - } - case 43: - { - // MULTIPLICATION WITH DEST: commit3 - SRC0: commit3 - SRC1: challenge - Goldilocks3GPU::mul_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 5] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 5] * FIELD_EXTENSION * blockDim.x]); - i_args += 6; - break; - } - case 44: - { - // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 5] * FIELD_EXTENSION * blockDim.x]); - i_args += 6; - break; - } - case 45: - { - // COPY tmp3 to commit3 - Goldilocks3GPU::copy_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args]] + args_[i_args + 1]) * blockDim.x], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION]); - i_args += 3; - break; - } - case 46: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: tmp3 - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 4] * blockDim.x * FIELD_EXTENSION]); - i_args += 5; - break; - } - case 47: - { - // MULTIPLICATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: challenge - Goldilocks3GPU::mul_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 48: - { - // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 49: - { - // MULTIPLICATION WITH DEST: commit3 - SRC0: challenge - SRC1: challenge - Goldilocks3GPU::mul_gpu(&bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 50: - { - // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 51: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 4]] + args_[i_args + 5]) * blockDim.x]); - i_args += 6; - break; - } - case 52: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &tmp1[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 53: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 54: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 4] * blockDim.x]); - i_args += 5; - break; - } - case 55: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x]); - i_args += 5; - break; - } - case 56: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &tmp1[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 57: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 58: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 59: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x]); - i_args += 5; - break; - } - case 60: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: tmp1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &tmp1[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 61: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: public - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->publics_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 62: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: number - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->numbers_d[args_[i_args + 3] * blockDim.x]); - i_args += 4; - break; - } - case 63: - { - // COPY commit3 to tmp3 - Goldilocks3GPU::copy_gpu(&tmp3[args_[i_args] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 1]] + args_[i_args + 2]) * blockDim.x]); - i_args += 3; - break; - } - case 64: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: commit3 - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 4]] + args_[i_args + 5]) * blockDim.x]); - i_args += 6; - break; - } - case 65: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: tmp3 - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &tmp3[args_[i_args + 4] * blockDim.x * FIELD_EXTENSION]); - i_args += 5; - break; - } - case 66: - { - // MULTIPLICATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: challenge - Goldilocks3GPU::mul_gpu(&tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 67: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - case 68: - { - // COPY tmp3 to tmp3 - Goldilocks3GPU::copy_gpu(&tmp3[args_[i_args] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION]); - i_args += 2; - break; - } - case 69: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: tmp3 - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 3] * blockDim.x * FIELD_EXTENSION]); - i_args += 4; - break; - } - case 70: - { - // MULTIPLICATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: challenge - Goldilocks3GPU::mul_gpu(&tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 71: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 72: - { - // MULTIPLICATION WITH DEST: tmp3 - SRC0: challenge - SRC1: challenge - Goldilocks3GPU::mul_gpu(&tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 73: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: challenge - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 74: - { - // COPY eval to tmp3 - Goldilocks3GPU::copy_gpu(&tmp3[args_[i_args] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->evals_d[args_[i_args + 1] * FIELD_EXTENSION * blockDim.x]); - i_args += 2; - break; - } - case 75: - { - // MULTIPLICATION WITH DEST: tmp3 - SRC0: eval - SRC1: challenge - Goldilocks3GPU::mul_gpu(&tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->evals_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->challenges_ops_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 76: - { - // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: eval - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->challenges_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &stepPointers_d->evals_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 77: - { - // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: eval - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &tmp3[args_[i_args + 2] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->evals_d[args_[i_args + 3] * FIELD_EXTENSION * blockDim.x]); - i_args += 4; - break; - } - case 78: - { - // OPERATION WITH DEST: tmp3 - SRC0: eval - SRC1: commit1 - Goldilocks3GPU::op_31_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &stepPointers_d->evals_d[args_[i_args + 2] * FIELD_EXTENSION * blockDim.x], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 3]] + args_[i_args + 4]) * blockDim.x]); - i_args += 5; - break; - } - case 79: - { - // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: eval - Goldilocks3GPU::op_gpu(args_[i_args], &tmp3[args_[i_args + 1] * blockDim.x * FIELD_EXTENSION], &bufferT_[(stepPointers_d->nColsStagesAcc_d[args_[i_args + 2]] + args_[i_args + 3]) * blockDim.x], &stepPointers_d->evals_d[args_[i_args + 4] * FIELD_EXTENSION * blockDim.x]); - i_args += 5; - break; - } - default: - { - return; - } - } - } -} - -__global__ void _transposeToBuffer(StepsPointers *stepPointers_d, uint64_t row, uint32_t stage, bool domainExtended, uint32_t istream){ - - gl64_t *bufferT_ = &(stepPointers_d->bufferT_d[istream * stepPointers_d->dimBufferT]); - gl64_t *bufferConsts_ = &(stepPointers_d->bufferConsts_d[istream * stepPointers_d->dimBufferConsts]); - gl64_t *bufferPols_ = &(stepPointers_d->bufferPols_d[istream * stepPointers_d->dimBufferPols]); - gl64_t * x = domainExtended ? stepPointers_d->x_2ns_d : stepPointers_d->x_n_d; - - uint64_t nextStrides[2] = {0, stepPointers_d->nextStride}; - for (uint64_t o = 0; o < 2; ++o) - { - for (uint64_t k = 0; k < stepPointers_d->nConstants; ++k) - { - - bufferT_[(stepPointers_d->nColsStagesAcc_d[5 * o] + k) * blockDim.x + threadIdx.x] = bufferConsts_[(threadIdx.x+nextStrides[o])* stepPointers_d->nConstants + k]; - } - } - - bufferT_[stepPointers_d->nConstants * blockDim.x + threadIdx.x] = x[row + threadIdx.x]; - bufferT_[(stepPointers_d->nConstants + 1) * blockDim.x + threadIdx.x] = stepPointers_d->zi_d[row + threadIdx.x]; - - uint32_t offset_pols = 0; - uint64_t nStages = 3; - for (uint64_t s = 1; s <= nStages; ++s) - { - for (uint64_t o = 0; o < 2; ++o) - { - for (uint64_t k = 0; k < stepPointers_d->nColsStages_d[s]; ++k) - { - uint64_t l = threadIdx.x + nextStrides[o]; - bufferT_[(stepPointers_d->nColsStagesAcc_d[5 * o + s] + k) * blockDim.x + threadIdx.x] = bufferPols_[offset_pols + l * stepPointers_d->nColsStages_d[s] + k]; - } - } - offset_pols += stepPointers_d->nColsStages_d[s] * (blockDim.x+stepPointers_d->nextStride); - } - if (stage == 5) - { - - for (uint64_t o = 0; o < 2; ++o) - { - for (uint64_t k = 0; k < stepPointers_d->nColsStages_d[nStages + 1]; ++k) - { - uint64_t l = threadIdx.x + nextStrides[o]; - bufferT_[(stepPointers_d->nColsStagesAcc_d[5 * o + nStages + 1] + k) * blockDim.x + threadIdx.x] = bufferPols_[offset_pols + l * stepPointers_d->nColsStages_d[nStages + 1] + k]; - } - } - - for (uint64_t d = 0; d < 2; ++d) - { - for (uint64_t i = 0; i < FIELD_EXTENSION; ++i) - { - bufferT_[(stepPointers_d->nColsStagesAcc_d[11] + FIELD_EXTENSION * d + i) * blockDim.x + threadIdx.x] = stepPointers_d->xDivXSubXi_d[(d * stepPointers_d->domainSize + row + threadIdx.x)*FIELD_EXTENSION+i]; - } - } - } - -} - -__global__ void _transposeFromBuffer(StepsPointers *stepPointers_d, uint64_t row, uint32_t stage, bool domainExtended, uint32_t istream){ - - gl64_t *bufferT_ = &(stepPointers_d->bufferT_d[istream * stepPointers_d->dimBufferT]); - - if (domainExtended) - { - gl64_t *bufferPols_ = &(stepPointers_d->bufferPols_d[(istream+1) * stepPointers_d->dimBufferPols-stepPointers_d->nColsStages_d[10]*blockDim.x]); - // Store either polinomial f or polinomial q - for (uint64_t k = 0; k < stepPointers_d->nColsStages_d[10]; ++k) - { - bufferPols_[threadIdx.x*stepPointers_d->nColsStages_d[10]+k] = bufferT_[(stepPointers_d->nColsStagesAcc_d[10] + k) * blockDim.x + threadIdx.x]; - } - }else{ - gl64_t *bufferPols_ = &(stepPointers_d->bufferPols_d[(istream) * stepPointers_d->dimBufferPols]); - uint64_t nStages = 3; - uint64_t offset_pols_d=0; - for (uint64_t s = 2; s <= nStages + 1; ++s) - { - gl64_t *buffT = &bufferT_[stepPointers_d->nColsStagesAcc_d[s]*blockDim.x]; - for (uint64_t k = 0; k < stepPointers_d->nColsStages_d[s]; ++k) - { - bufferPols_[offset_pols_d + threadIdx.x *stepPointers_d->nColsStages_d[s]+k] = buffT[k*blockDim.x + threadIdx.x]; - } - offset_pols_d += stepPointers_d->nColsStages_d[s]*blockDim.x; - } - } -} -#endif diff --git a/src/starkpil/chelpers_steps_gpu.hpp b/src/starkpil/chelpers_steps_gpu.hpp deleted file mode 100644 index cf21390e8..000000000 --- a/src/starkpil/chelpers_steps_gpu.hpp +++ /dev/null @@ -1,73 +0,0 @@ - -#ifndef CHELPERS_STEPS_GPU_HPP -#define CHELPERS_STEPS_GPU_HPP -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - -#include "definitions.hpp" -#include "chelpers.hpp" -#include "chelpers_steps.hpp" -#include "steps.hpp" - -#include -class gl64_t; -struct StepsPointers -{ - uint64_t domainSize; - uint64_t nConstants; - uint64_t nextStride; - uint64_t *nColsStages_d; - uint64_t *nColsStagesAcc_d; - - uint32_t *ops_d; - uint32_t *args_d; - gl64_t *numbers_d; - gl64_t *challenges_d; - gl64_t *challenges_ops_d; - gl64_t *publics_d; - gl64_t *evals_d; - gl64_t *x_n_d; - gl64_t *x_2ns_d; - gl64_t *zi_d; - gl64_t *xDivXSubXi_d; - - gl64_t *bufferT_d; - gl64_t *bufferPols_d; - gl64_t *bufferConsts_d; - gl64_t *tmp1_d; - gl64_t *tmp3_d; - - uint32_t dimBufferT; - uint32_t dimBufferPols; - uint32_t dimBufferConsts; - uint32_t dimTmp1; - uint32_t dimTmp3; -}; -class CHelpersStepsGPU : public CHelpersSteps -{ - -public: - uint32_t nstreams; - StepsPointers *stepPointers_d; - StepsPointers stepPointers_h; - - void dataSetup(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams); - void loadData(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams, uint64_t row, cudaStream_t& stream); - void storeData(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams, uint64_t row, cudaStream_t& stream); -# - void calculateExpressions(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams); - void freePointers(); -}; -__global__ void _transposeToBuffer(StepsPointers *stepPointers_d, uint64_t row, uint32_t stage, bool domainExtended, uint32_t stream); -__global__ void _transposeFromBuffer(StepsPointers *stepPointers_d, uint64_t row, uint32_t stage, bool domainExtended, uint32_t stream); -__global__ void _packComputation(StepsPointers *stepPointers_d, uint32_t N, uint32_t nOps, uint32_t nArgs, uint32_t stream); - -inline void checkCudaError(cudaError_t err, const char *operation){ - if (err != cudaSuccess) - { - printf("%s failed: %s\n", operation, cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} - -#endif -#endif \ No newline at end of file diff --git a/src/starkpil/chelpers_steps_pack.cu b/src/starkpil/chelpers_steps_pack.cu new file mode 100644 index 000000000..2f9232144 --- /dev/null +++ b/src/starkpil/chelpers_steps_pack.cu @@ -0,0 +1,918 @@ +#include "zklog.hpp" +#include + +#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) +#ifdef __AVX512__ +#include "chelpers_steps_avx512.hpp" +#endif +#include "chelpers_steps_pack.cuh" +#include "goldilocks_cubic_extension.cuh" +#include "cuda_utils.cuh" +#include "cuda_utils.hpp" +#include "timer.hpp" + +const uint64_t MAX_U64 = 0xFFFFFFFFFFFFFFFF; + +CHelpersStepsPackGPU *cHelpersSteps[MAX_GPUS]; +uint64_t *gpuSharedStorage[MAX_GPUS]; +uint64_t *streamExclusiveStorage[nStreams*MAX_GPUS]; +cudaStream_t streams[nStreams*MAX_GPUS]; + +void CHelpersStepsPackGPU::prepareGPU(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams) { + + prepare(starkInfo, params, parserArgs, parserParams); + + + CHECKCUDAERR(cudaGetDeviceCount(&nDevices)); + printf("nDevices: %d\n", nDevices); + nCudaThreads = 1<<15; + domainExtended = parserParams.stage > 3 ? true : false; + domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; + subDomainSize = nrowsPack * nCudaThreads; + nextStride = domainExtended ? 1 << (starkInfo.starkStruct.nBitsExt - starkInfo.starkStruct.nBits) : 1; + + nOps = parserParams.nOps; + nArgs = parserParams.nArgs; + nBufferT = 2*nCols*nrowsPack; + nTemp1 = parserParams.nTemp1*nrowsPack; + nTemp3 = parserParams.nTemp3*FIELD_EXTENSION*nrowsPack; + + printf("nCols:%lu\n", nCols); + printf("nrowsPack:%lu\n", nrowsPack); + + offsetsStagesGPU.resize(offsetsStages.size()); + uint64_t total_pols = 0; + for (uint64_t s = 1; s < 11; s++) { + if (s < 4 || (s == 4 && parserParams.stage != 4) || (s == 10 && domainExtended)) { + printf("s=%lu, offsets=%lu\n", s, total_pols); + offsetsStagesGPU[s] = total_pols; + total_pols += nColsStages[s] * (nrowsPack * nCudaThreads + nextStride); + } else { + offsetsStagesGPU[s] = MAX_U64; + } + } + + printf("total_pols:%lu\n", total_pols); + + sharedStorageSize = 0; + ops_offset = sharedStorageSize; + sharedStorageSize += nOps; + + args_offset = sharedStorageSize; + sharedStorageSize += nArgs; + + offsetsStages_offset = sharedStorageSize; + sharedStorageSize += offsetsStages.size(); + + nColsStages_offset = sharedStorageSize; + sharedStorageSize += nColsStages.size(); + + nColsStagesAcc_offset = sharedStorageSize; + sharedStorageSize += nColsStagesAcc.size(); + + challenges_offset = sharedStorageSize; + sharedStorageSize += challenges.size(); + + challenges_ops_offset = sharedStorageSize; + sharedStorageSize += challenges_ops.size(); + + numbers_offset = sharedStorageSize; + sharedStorageSize += numbers_.size(); + + publics_offset = sharedStorageSize; + sharedStorageSize += publics.size(); + + evals_offset = sharedStorageSize; + sharedStorageSize += evals.size(); + + uint64_t *ops64 = (uint64_t *)malloc(nOps * sizeof(uint64_t)); + for (uint32_t i=0; i domainSize || (rowEnd -rowIni) % nrowsPack != 0) { + zklog.info("Invalid range for rowIni " + to_string(rowIni) + " and rowEnd " + to_string(rowEnd)); + exitProcess(); + } + + assert((rowEnd - rowIni) % (nrowsPack*nCudaThreads*nStreams*nDevices) == 0); + uint64_t nrowPerStream = (rowEnd - rowIni) / nStreams /nDevices; + + for (int s=0; s>>(cHelpersSteps_d, sharedStorage, exclusiveStorage, starkInfo.nConstants, parserParams.stage); + pack_kernel<<<(nCudaThreads+15)/16,16,0,stream>>>(cHelpersSteps_d, sharedStorage, exclusiveStorage); + storePolinomialsGPU<<<(nCudaThreads+15)/16,16,0,stream>>>(cHelpersSteps_d, sharedStorage, exclusiveStorage); + //TimerStopAndLog(EXP_Kernel); + + //TimerStart(Memcpy_D_to_H); + storeData(starkInfo, params, i, s); + //TimerStopAndLog(Memcpy_D_to_H); + } + //TimerStopAndLog(STREAM_OPS); + } + + + TimerStart(WAIT_STREAM); + for (uint32_t s = 0; s < nStreams*nDevices; s++) { + CHECKCUDAERR(cudaStreamSynchronize(streams[s])); + } + TimerStopAndLog(WAIT_STREAM); +} + +void CHelpersStepsPackGPU::loadData(StarkInfo &starkInfo, StepsParams ¶ms, uint64_t row, uint32_t s) { + + ConstantPolsStarks *constPols = domainExtended ? params.pConstPols2ns : params.pConstPols; + Polinomial &x = domainExtended ? params.x_2ns : params.x_n; + + uint64_t *exclusiveStorage = streamExclusiveStorage[s]; + uint64_t *constPols_d = exclusiveStorage + constPols_offset; + uint64_t *x_d = exclusiveStorage + x_offset; + uint64_t *zi_d = exclusiveStorage + zi_offset; + uint64_t *pols_d = exclusiveStorage + pols_offset; + uint64_t *xDivXSubXi_d = exclusiveStorage + xDivXSubXi_offset; + + cudaStream_t stream = streams[s]; + + if (row + subDomainSize != domainSize) { + CHECKCUDAERR(cudaMemcpyAsync(constPols_d, ((Goldilocks::Element *)constPols->address()) + row * starkInfo.nConstants, starkInfo.nConstants * (subDomainSize + nextStride) * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + } else { + CHECKCUDAERR(cudaMemcpyAsync(constPols_d, ((Goldilocks::Element *)constPols->address()) + row * starkInfo.nConstants, starkInfo.nConstants * subDomainSize * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + CHECKCUDAERR(cudaMemcpyAsync(constPols_d + starkInfo.nConstants * subDomainSize, (Goldilocks::Element *)constPols->address(), starkInfo.nConstants * nextStride * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + } + + CHECKCUDAERR(cudaMemcpyAsync(x_d, x[row], subDomainSize * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + CHECKCUDAERR(cudaMemcpyAsync(zi_d, params.zi[row], subDomainSize * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + + for (uint64_t s = 1; s < 11; s++) { + if (offsetsStagesGPU[s] != MAX_U64) { + if (row + subDomainSize != domainSize) { + CHECKCUDAERR(cudaMemcpyAsync(pols_d + offsetsStagesGPU[s], ¶ms.pols[offsetsStages[s] + row*nColsStages[s]], (subDomainSize+nextStride) *nColsStages[s] * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + } else { + CHECKCUDAERR(cudaMemcpyAsync(pols_d + offsetsStagesGPU[s], ¶ms.pols[offsetsStages[s] + row*nColsStages[s]], subDomainSize *nColsStages[s] * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + CHECKCUDAERR(cudaMemcpyAsync(pols_d + offsetsStagesGPU[s] + subDomainSize *nColsStages[s], ¶ms.pols[offsetsStages[s]], nextStride *nColsStages[s] * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + } + } + } + + CHECKCUDAERR(cudaMemcpyAsync(xDivXSubXi_d, params.xDivXSubXi[row], subDomainSize *FIELD_EXTENSION * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); + CHECKCUDAERR(cudaMemcpyAsync(xDivXSubXi_d + subDomainSize *FIELD_EXTENSION, params.xDivXSubXi[domainSize + row], subDomainSize *FIELD_EXTENSION * sizeof(uint64_t), cudaMemcpyHostToDevice, stream)); +} + +void CHelpersStepsPackGPU::storeData(StarkInfo &starkInfo, StepsParams ¶ms, uint64_t row, uint32_t s) { + uint64_t *pols_d = streamExclusiveStorage[s] + pols_offset; + cudaStream_t stream = streams[s]; + for (uint64_t s = 1; s < 11; s++) { + if (offsetsStagesGPU[s] != MAX_U64) { + CHECKCUDAERR(cudaMemcpyAsync(¶ms.pols[offsetsStages[s] + row*nColsStages[s]], pols_d + offsetsStagesGPU[s], subDomainSize *nColsStages[s] * sizeof(uint64_t), cudaMemcpyDeviceToHost, stream)); + } + } +} + +__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage) { + + uint64_t nCudaThreads = cHelpersSteps->nCudaThreads; + + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nCudaThreads) { + return; + } + + uint64_t nrowsPack = cHelpersSteps->nrowsPack; + uint64_t nextStride = cHelpersSteps->nextStride; + uint64_t subDomainSize = cHelpersSteps->subDomainSize; + uint64_t nBufferT = cHelpersSteps->nBufferT; + + uint64_t *nColsStages = sharedStorage + cHelpersSteps->nColsStages_offset; + uint64_t *nColsStagesAcc = sharedStorage + cHelpersSteps->nColsStagesAcc_offset; + uint64_t *offsetsStages = sharedStorage + cHelpersSteps->offsetsStages_offset; + + gl64_t *bufferT_ = (gl64_t *)exclusiveStorage + cHelpersSteps->bufferT_offset + idx * nBufferT; + gl64_t *pols = (gl64_t *)exclusiveStorage + cHelpersSteps->pols_offset; + gl64_t *constPols = (gl64_t *)exclusiveStorage + cHelpersSteps->constPols_offset; + + uint64_t row = idx*nrowsPack; + uint64_t nStages = 3; + uint64_t nextStrides[2] = {0, nextStride}; + + for(uint64_t k = 0; k < nConstants; ++k) { + for(uint64_t o = 0; o < 2; ++o) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + uint64_t l = (row + j + nextStrides[o]); + bufferT_[(nColsStagesAcc[5*o] + k)*nrowsPack + j] = constPols[l * nConstants + k]; + } + } + } + + // Load x and Zi + for(uint64_t j = 0; j < nrowsPack; ++j) { + bufferT_[nConstants*nrowsPack + j] = (exclusiveStorage + cHelpersSteps->x_offset)[row + j]; + } + for(uint64_t j = 0; j < nrowsPack; ++j) { + bufferT_[(nConstants + 1)*nrowsPack + j] = (exclusiveStorage + cHelpersSteps->zi_offset)[row + j]; + } + + for(uint64_t s = 1; s <= nStages; ++s) { + for(uint64_t k = 0; k < nColsStages[s]; ++k) { + for(uint64_t o = 0; o < 2; ++o) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + uint64_t l = (row + j + nextStrides[o]); + bufferT_[(nColsStagesAcc[5*o + s] + k)*nrowsPack + j] = pols[offsetsStages[s] + l * nColsStages[s] + k]; + } + } + } + } + + if(stage == 5) { + for(uint64_t k = 0; k < nColsStages[nStages + 1]; ++k) { + for(uint64_t o = 0; o < 2; ++o) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + uint64_t l = (row + j + nextStrides[o]); // % domainSize; + bufferT_[(nColsStagesAcc[5*o + nStages + 1] + k)*nrowsPack + j] = pols[offsetsStages[nStages + 1] + l * nColsStages[nStages + 1] + k]; + } + } + } + + // Load xDivXSubXi & xDivXSubWXi + for(uint64_t d = 0; d < 2; ++d) { + for(uint64_t i = 0; i < FIELD_EXTENSION; ++i) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + bufferT_[(nColsStagesAcc[11] + FIELD_EXTENSION*d + i)*nrowsPack + j] = (exclusiveStorage + cHelpersSteps->xDivXSubXi_offset)[(d*subDomainSize + row + j) * FIELD_EXTENSION + i]; + } + } + } + } +} + +__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) { + uint64_t nCudaThreads = cHelpersSteps->nCudaThreads; + + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nCudaThreads) { + return; + } + + bool domainExtended = cHelpersSteps->domainExtended; + uint64_t nrowsPack = cHelpersSteps->nrowsPack; + uint64_t nBufferT = cHelpersSteps->nBufferT; + + uint64_t row = idx*nrowsPack; + + uint64_t *nColsStages = sharedStorage + cHelpersSteps->nColsStages_offset; + uint64_t *nColsStagesAcc = sharedStorage + cHelpersSteps->nColsStagesAcc_offset; + uint64_t *offsetsStages = sharedStorage + cHelpersSteps->offsetsStages_offset; + + gl64_t *bufferT_ = (gl64_t *)exclusiveStorage + cHelpersSteps->bufferT_offset + idx * nBufferT; + gl64_t *pols = (gl64_t *)exclusiveStorage + cHelpersSteps->pols_offset; + + if(domainExtended) { + // Store either polinomial f or polinomial q + for(uint64_t k = 0; k < nColsStages[10]; ++k) { + gl64_t *buffT = &bufferT_[(nColsStagesAcc[10] + k)* nrowsPack]; + gl64_t::copy_pack(nrowsPack, &pols[offsetsStages[10] + k + row * nColsStages[10]], nColsStages[10], buffT); + } + } else { + uint64_t nStages = 3; + for(uint64_t s = 2; s <= nStages + 1; ++s) { + for(uint64_t k = 0; k < nColsStages[s]; ++k) { + gl64_t *buffT = &bufferT_[(nColsStagesAcc[s] + k)* nrowsPack]; + gl64_t::copy_pack(nrowsPack, &pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT); + } + } + } +} + +__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage) +{ + uint64_t nCudaThreads = cHelpersSteps->nCudaThreads; + + uint64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= nCudaThreads) { + return; + } + + uint64_t nrowsPack = cHelpersSteps->nrowsPack; + uint64_t nOps = cHelpersSteps->nOps; + uint64_t nArgs = cHelpersSteps->nArgs; + uint64_t nBufferT = cHelpersSteps->nBufferT; + uint64_t nTemp1 = cHelpersSteps->nTemp1; + uint64_t nTemp3 = cHelpersSteps->nTemp3; + + uint64_t *nColsStagesAcc = sharedStorage + cHelpersSteps->nColsStagesAcc_offset; + uint64_t *ops = sharedStorage + cHelpersSteps->ops_offset; + uint64_t *args = sharedStorage + cHelpersSteps->args_offset; + gl64_t *challenges = (gl64_t *)sharedStorage + cHelpersSteps->challenges_offset; + gl64_t *challenges_ops = (gl64_t *)sharedStorage + cHelpersSteps->challenges_ops_offset; + gl64_t *numbers_ = (gl64_t *)sharedStorage + cHelpersSteps->numbers_offset; + gl64_t *publics = (gl64_t *)sharedStorage + cHelpersSteps->publics_offset; + gl64_t *evals = (gl64_t *)sharedStorage + cHelpersSteps->evals_offset; + + gl64_t *bufferT_ = (gl64_t *)exclusiveStorage + cHelpersSteps->bufferT_offset + idx * nBufferT; + gl64_t *tmp1 = (gl64_t *)exclusiveStorage + cHelpersSteps->tmp1_offset + nTemp1*idx; + gl64_t *tmp3 = (gl64_t *)exclusiveStorage + cHelpersSteps->tmp3_offset + nTemp3*idx; + + uint64_t i_args = 0; + + for (uint64_t kk = 0; kk < nOps; ++kk) { + switch (ops[kk]) { + case 0: { + // COPY commit1 to commit1 + gl64_t::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack]); + i_args += 4; + break; + } + case 1: { + // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: commit1 + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 5]] + args[i_args + 6]) * nrowsPack]); + i_args += 7; + break; + } + case 2: { + // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: tmp1 + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &tmp1[args[i_args + 5] * nrowsPack]); + i_args += 6; + break; + } + case 3: { + // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &publics[args[i_args + 5] * nrowsPack]); + i_args += 6; + break; + } + case 4: { + // OPERATION WITH DEST: commit1 - SRC0: commit1 - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &numbers_[args[i_args + 5]*nrowsPack]); + i_args += 6; + break; + } + case 5: { + // COPY tmp1 to commit1 + gl64_t::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &tmp1[args[i_args + 2] * nrowsPack]); + i_args += 3; + break; + } + case 6: { + // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: tmp1 + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp1[args[i_args + 3] * nrowsPack], &tmp1[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 7: { + // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp1[args[i_args + 3] * nrowsPack], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 8: { + // OPERATION WITH DEST: commit1 - SRC0: tmp1 - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp1[args[i_args + 3] * nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 9: { + // COPY public to commit1 + gl64_t::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &publics[args[i_args + 2] * nrowsPack]); + i_args += 3; + break; + } + case 10: { + // OPERATION WITH DEST: commit1 - SRC0: public - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &publics[args[i_args + 3] * nrowsPack], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 11: { + // OPERATION WITH DEST: commit1 - SRC0: public - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &publics[args[i_args + 3] * nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 12: { + // COPY number to commit1 + gl64_t::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &numbers_[args[i_args + 2]*nrowsPack]); + i_args += 3; + break; + } + case 13: { + // OPERATION WITH DEST: commit1 - SRC0: number - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &numbers_[args[i_args + 3]*nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 14: { + // COPY commit1 to tmp1 + gl64_t::copy_pack(nrowsPack, &tmp1[args[i_args] * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack]); + i_args += 3; + break; + } + case 15: { + // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: commit1 + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 4]] + args[i_args + 5]) * nrowsPack]); + i_args += 6; + break; + } + case 16: { + // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: tmp1 + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &tmp1[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 17: { + // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 18: { + // OPERATION WITH DEST: tmp1 - SRC0: commit1 - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 19: { + // COPY tmp1 to tmp1 + gl64_t::copy_pack(nrowsPack, &tmp1[args[i_args] * nrowsPack], &tmp1[args[i_args + 1] * nrowsPack]); + i_args += 2; + break; + } + case 20: { + // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: tmp1 + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &tmp1[args[i_args + 2] * nrowsPack], &tmp1[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 21: { + // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &tmp1[args[i_args + 2] * nrowsPack], &publics[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 22: { + // OPERATION WITH DEST: tmp1 - SRC0: tmp1 - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &tmp1[args[i_args + 2] * nrowsPack], &numbers_[args[i_args + 3]*nrowsPack]); + i_args += 4; + break; + } + case 23: { + // COPY public to tmp1 + gl64_t::copy_pack(nrowsPack, &tmp1[args[i_args] * nrowsPack], &publics[args[i_args + 1] * nrowsPack]); + i_args += 2; + break; + } + case 24: { + // OPERATION WITH DEST: tmp1 - SRC0: public - SRC1: public + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &publics[args[i_args + 2] * nrowsPack], &publics[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 25: { + // OPERATION WITH DEST: tmp1 - SRC0: public - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &publics[args[i_args + 2] * nrowsPack], &numbers_[args[i_args + 3]*nrowsPack]); + i_args += 4; + break; + } + case 26: { + // COPY number to tmp1 + gl64_t::copy_pack(nrowsPack, &tmp1[args[i_args] * nrowsPack], &numbers_[args[i_args + 1]*nrowsPack]); + i_args += 2; + break; + } + case 27: { + // OPERATION WITH DEST: tmp1 - SRC0: number - SRC1: number + gl64_t::op_pack(nrowsPack, args[i_args], &tmp1[args[i_args + 1] * nrowsPack], &numbers_[args[i_args + 2]*nrowsPack], &numbers_[args[i_args + 3]*nrowsPack]); + i_args += 4; + break; + } + case 28: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 5]] + args[i_args + 6]) * nrowsPack]); + i_args += 7; + break; + } + case 29: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &tmp1[args[i_args + 5] * nrowsPack]); + i_args += 6; + break; + } + case 30: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &publics[args[i_args + 5] * nrowsPack]); + i_args += 6; + break; + } + case 31: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &numbers_[args[i_args + 5]*nrowsPack]); + i_args += 6; + break; + } + case 32: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 4]] + args[i_args + 5]) * nrowsPack]); + i_args += 6; + break; + } + case 33: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &tmp1[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 34: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 35: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 36: { + // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 4]] + args[i_args + 5]) * nrowsPack]); + i_args += 6; + break; + } + case 37: { + // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &tmp1[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 38: { + // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 39: { + // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 40: { + // COPY commit3 to commit3 + Goldilocks3GPU::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack]); + i_args += 4; + break; + } + case 41: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: commit3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 5]] + args[i_args + 6]) * nrowsPack]); + i_args += 7; + break; + } + case 42: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: tmp3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &tmp3[args[i_args + 5] * nrowsPack * FIELD_EXTENSION]); + i_args += 6; + break; + } + case 43: { + // MULTIPLICATION WITH DEST: commit3 - SRC0: commit3 - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &challenges[args[i_args + 5]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 5]*FIELD_EXTENSION*nrowsPack]); + i_args += 6; + break; + } + case 44: { + // OPERATION WITH DEST: commit3 - SRC0: commit3 - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack], &challenges[args[i_args + 5]*FIELD_EXTENSION*nrowsPack]); + i_args += 6; + break; + } + case 45: { + // COPY tmp3 to commit3 + Goldilocks3GPU::copy_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args]] + args[i_args + 1]) * nrowsPack], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION]); + i_args += 3; + break; + } + case 46: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: tmp3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 4] * nrowsPack * FIELD_EXTENSION]); + i_args += 5; + break; + } + case 47: { + // MULTIPLICATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 48: { + // OPERATION WITH DEST: commit3 - SRC0: tmp3 - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 49: { + // MULTIPLICATION WITH DEST: commit3 - SRC0: challenge - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 50: { + // OPERATION WITH DEST: commit3 - SRC0: challenge - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 51: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 4]] + args[i_args + 5]) * nrowsPack]); + i_args += 6; + break; + } + case 52: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &tmp1[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 53: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &publics[args[i_args + 4] * nrowsPack]); + i_args += 5; + break; + } + case 54: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &numbers_[args[i_args + 4]*nrowsPack]); + i_args += 5; + break; + } + case 55: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack]); + i_args += 5; + break; + } + case 56: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &tmp1[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 57: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &publics[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 58: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &numbers_[args[i_args + 3]*nrowsPack]); + i_args += 4; + break; + } + case 59: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack]); + i_args += 5; + break; + } + case 60: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: tmp1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &tmp1[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 61: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: public + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &publics[args[i_args + 3] * nrowsPack]); + i_args += 4; + break; + } + case 62: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: number + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &numbers_[args[i_args + 3]*nrowsPack]); + i_args += 4; + break; + } + case 63: { + // COPY commit3 to tmp3 + Goldilocks3GPU::copy_pack(nrowsPack, &tmp3[args[i_args] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 1]] + args[i_args + 2]) * nrowsPack]); + i_args += 3; + break; + } + case 64: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: commit3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 4]] + args[i_args + 5]) * nrowsPack]); + i_args += 6; + break; + } + case 65: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: tmp3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &tmp3[args[i_args + 4] * nrowsPack * FIELD_EXTENSION]); + i_args += 5; + break; + } + case 66: { + // MULTIPLICATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 67: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &challenges[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + case 68: { + // COPY tmp3 to tmp3 + Goldilocks3GPU::copy_pack(nrowsPack, &tmp3[args[i_args] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION]); + i_args += 2; + break; + } + case 69: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: tmp3 + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 3] * nrowsPack * FIELD_EXTENSION]); + i_args += 4; + break; + } + case 70: { + // MULTIPLICATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 71: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 72: { + // MULTIPLICATION WITH DEST: tmp3 - SRC0: challenge - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 73: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: challenge + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 74: { + // COPY eval to tmp3 + Goldilocks3GPU::copy_pack(nrowsPack, &tmp3[args[i_args] * nrowsPack * FIELD_EXTENSION], &evals[args[i_args + 1]*FIELD_EXTENSION*nrowsPack]); + i_args += 2; + break; + } + case 75: { + // MULTIPLICATION WITH DEST: tmp3 - SRC0: eval - SRC1: challenge + Goldilocks3GPU::mul_pack(nrowsPack, &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &evals[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &challenges[args[i_args + 3]*FIELD_EXTENSION*nrowsPack], &challenges_ops[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 76: { + // OPERATION WITH DEST: tmp3 - SRC0: challenge - SRC1: eval + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &challenges[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &evals[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 77: { + // OPERATION WITH DEST: tmp3 - SRC0: tmp3 - SRC1: eval + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &tmp3[args[i_args + 2] * nrowsPack * FIELD_EXTENSION], &evals[args[i_args + 3]*FIELD_EXTENSION*nrowsPack]); + i_args += 4; + break; + } + case 78: { + // OPERATION WITH DEST: tmp3 - SRC0: eval - SRC1: commit1 + Goldilocks3GPU::op_31_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &evals[args[i_args + 2]*FIELD_EXTENSION*nrowsPack], &bufferT_[(nColsStagesAcc[args[i_args + 3]] + args[i_args + 4]) * nrowsPack]); + i_args += 5; + break; + } + case 79: { + // OPERATION WITH DEST: tmp3 - SRC0: commit3 - SRC1: eval + Goldilocks3GPU::op_pack(nrowsPack, args[i_args], &tmp3[args[i_args + 1] * nrowsPack * FIELD_EXTENSION], &bufferT_[(nColsStagesAcc[args[i_args + 2]] + args[i_args + 3]) * nrowsPack], &evals[args[i_args + 4]*FIELD_EXTENSION*nrowsPack]); + i_args += 5; + break; + } + default: { + assert(false); + } + } + } + assert(i_args == nArgs); +} +#endif diff --git a/src/starkpil/chelpers_steps_pack.cuh b/src/starkpil/chelpers_steps_pack.cuh new file mode 100644 index 000000000..8d5e50123 --- /dev/null +++ b/src/starkpil/chelpers_steps_pack.cuh @@ -0,0 +1,66 @@ +#ifndef CHELPERS_STEPS_GPU_CUH +#define CHELPERS_STEPS_GPU_CUH + +#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) +#include "chelpers_steps_pack.hpp" +#include +const int nStreams = 2; // streams per device +const int MAX_GPUS = 8; +class gl64_t; +class CHelpersStepsPackGPU: public CHelpersStepsPack { +public: + + int nDevices; + int32_t nCudaThreads; + + bool domainExtended; + uint64_t domainSize; + uint64_t subDomainSize; + uint32_t nextStride; + uint32_t nOps; + uint32_t nArgs; + uint32_t nBufferT; + uint32_t nTemp1; + uint32_t nTemp3; + + vector offsetsStagesGPU; + + uint32_t sharedStorageSize = 0; + uint32_t ops_offset; + uint32_t args_offset; + uint32_t offsetsStages_offset; + uint32_t nColsStages_offset; + uint32_t nColsStagesAcc_offset; + uint32_t challenges_offset; + uint32_t challenges_ops_offset; + uint32_t numbers_offset; + uint32_t publics_offset; + uint32_t evals_offset; + + + uint32_t exclusiveStorageSize = 0; + uint32_t constPols_offset; + uint32_t x_offset; + uint32_t zi_offset; + uint32_t pols_offset; + uint32_t xDivXSubXi_offset; + uint32_t bufferT_offset; + uint32_t tmp1_offset; + uint32_t tmp3_offset; + + void calculateExpressions(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams); + void calculateExpressionsRowsGPU(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams, uint64_t rowIni, uint64_t rowEnd); + void prepareGPU(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams); + void compare(StepsParams ¶ms, uint64_t row); + void cleanupGPU(); + + void loadData(StarkInfo &starkInfo, StepsParams ¶ms, uint64_t row, uint32_t streamIdx); + void storeData(StarkInfo &starkInfo, StepsParams ¶ms, uint64_t row, uint32_t streamIdx); +}; + +__global__ void loadPolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage, uint64_t nConstants, uint64_t stage); +__global__ void storePolinomialsGPU(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage); +__global__ void pack_kernel(CHelpersStepsPackGPU *cHelpersSteps, uint64_t *sharedStorage, uint64_t *exclusiveStorage); + +#endif +#endif diff --git a/src/starkpil/chelpers_steps_pack.hpp b/src/starkpil/chelpers_steps_pack.hpp index b6067e813..7c473e678 100644 --- a/src/starkpil/chelpers_steps_pack.hpp +++ b/src/starkpil/chelpers_steps_pack.hpp @@ -15,6 +15,16 @@ class CHelpersStepsPack : public CHelpersSteps { vector nColsStagesAcc; vector offsetsStages; + uint8_t* ops; + uint16_t* args; + uint8_t* storePol; + + vector challenges; + vector challenges_ops; + vector numbers_; + vector publics; + vector evals; + using CHelpersSteps::storePolinomials; // Just to avoid compiation warnings using CHelpersSteps::loadPolinomials; // Just to avoid compiation warnings @@ -68,6 +78,53 @@ class CHelpersStepsPack : public CHelpersSteps { nCols = nColsStagesAcc[11] + 6; // 3 for xDivXSubXi and 3 for xDivXSubWxi } + inline virtual void prepare(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams) { + + setBufferTInfo(starkInfo, parserParams.stage); + + challenges.resize(params.challenges.degree()*FIELD_EXTENSION*nrowsPack); + challenges_ops.resize(params.challenges.degree()*FIELD_EXTENSION*nrowsPack); + for(uint64_t i = 0; i < params.challenges.degree(); ++i) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + challenges[(i*FIELD_EXTENSION)*nrowsPack + j] = params.challenges[i][0]; + challenges[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.challenges[i][1]; + challenges[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.challenges[i][2]; + challenges_ops[(i*FIELD_EXTENSION)*nrowsPack + j] = params.challenges[i][0] + params.challenges[i][1]; + challenges_ops[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.challenges[i][0] + params.challenges[i][2]; + challenges_ops[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.challenges[i][1] + params.challenges[i][2]; + } + } + + numbers_.resize(parserParams.nNumbers*nrowsPack); + for(uint64_t i = 0; i < parserParams.nNumbers; ++i) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + numbers_[i*nrowsPack + j] = Goldilocks::fromU64(parserArgs.numbers[parserParams.numbersOffset+i]); + } + } + + + publics.resize(starkInfo.nPublics*nrowsPack); + for(uint64_t i = 0; i < starkInfo.nPublics; ++i) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + publics[i*nrowsPack + j] = params.publicInputs[i]; + } + } + + + evals.resize(params.evals.degree()*FIELD_EXTENSION*nrowsPack); + for(uint64_t i = 0; i < params.evals.degree(); ++i) { + for(uint64_t j = 0; j < nrowsPack; ++j) { + evals[(i*FIELD_EXTENSION)*nrowsPack + j] = params.evals[i][0]; + evals[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.evals[i][1]; + evals[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.evals[i][2]; + } + } + + ops = &parserArgs.ops[parserParams.opsOffset]; + args = &parserArgs.args[parserParams.argsOffset]; + storePol = &parserArgs.storePols[parserParams.storePolsOffset]; + } + inline virtual void storePolinomials(StarkInfo &starkInfo, StepsParams ¶ms, Goldilocks::Element *bufferT_, uint8_t* storePol, uint64_t row, uint64_t nrowsPack, uint64_t domainExtended) { if(domainExtended) { // Store either polinomial f or polinomial q @@ -83,10 +140,8 @@ class CHelpersStepsPack : public CHelpersSteps { for(uint64_t k = 0; k < nColsStages[s]; ++k) { uint64_t dim = storePol[nColsStagesAcc[s] + k]; if(!TRANSPOSE_TMP_POLS) { - for(uint64_t k = 0; k < nColsStages[s]; ++k) { - Goldilocks::Element *buffT = &bufferT_[(nColsStagesAcc[s] + k)* nrowsPack]; - Goldilocks::copy_pack(nrowsPack, ¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT); - } + Goldilocks::Element *buffT = &bufferT_[(nColsStagesAcc[s] + k)* nrowsPack]; + Goldilocks::copy_pack(nrowsPack, ¶ms.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT); } else { if(storePol[nColsStagesAcc[s] + k]) { Goldilocks::Element *buffT = &bufferT_[(nColsStagesAcc[s] + k)* nrowsPack]; @@ -162,7 +217,8 @@ class CHelpersStepsPack : public CHelpersSteps { virtual void calculateExpressions(StarkInfo &starkInfo, StepsParams ¶ms, ParserArgs &parserArgs, ParserParams &parserParams){ bool domainExtended = parserParams.stage > 3 ? true : false; - uint64_t domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; + uint64_t domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; + prepare(starkInfo, params, parserArgs, parserParams); calculateExpressionsRows(starkInfo, params, parserArgs, parserParams, 0, domainSize); } @@ -171,57 +227,16 @@ class CHelpersStepsPack : public CHelpersSteps { bool domainExtended = parserParams.stage > 3 ? true : false; uint64_t domainSize = domainExtended ? 1 << starkInfo.starkStruct.nBitsExt : 1 << starkInfo.starkStruct.nBits; - uint8_t *ops = &parserArgs.ops[parserParams.opsOffset]; - uint16_t *args = &parserArgs.args[parserParams.argsOffset]; - uint64_t *numbers = &parserArgs.numbers[parserParams.numbersOffset]; - uint8_t *storePol = &parserArgs.storePols[parserParams.storePolsOffset]; if(rowEnd < rowIni || rowEnd > domainSize) { zklog.info("Invalid range for rowIni and rowEnd"); exitProcess(); } - if(rowEnd -rowIni % nrowsPack != 0) { - nrowsPack = 1; - } - - setBufferTInfo(starkInfo, parserParams.stage); - Goldilocks::Element challenges[params.challenges.degree()*FIELD_EXTENSION*nrowsPack]; - Goldilocks::Element challenges_ops[params.challenges.degree()*FIELD_EXTENSION*nrowsPack]; - for(uint64_t i = 0; i < params.challenges.degree(); ++i) { - for(uint64_t j = 0; j < nrowsPack; ++j) { - challenges[(i*FIELD_EXTENSION)*nrowsPack + j] = params.challenges[i][0]; - challenges[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.challenges[i][1]; - challenges[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.challenges[i][2]; - challenges_ops[(i*FIELD_EXTENSION)*nrowsPack + j] = params.challenges[i][0] + params.challenges[i][1]; - challenges_ops[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.challenges[i][0] + params.challenges[i][2]; - challenges_ops[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.challenges[i][1] + params.challenges[i][2]; - } - } - - Goldilocks::Element numbers_[parserParams.nNumbers*nrowsPack]; - for(uint64_t i = 0; i < parserParams.nNumbers; ++i) { - for(uint64_t j = 0; j < nrowsPack; ++j) { - numbers_[i*nrowsPack + j] = Goldilocks::fromU64(numbers[i]); - } - } - - Goldilocks::Element publics[starkInfo.nPublics*nrowsPack]; - for(uint64_t i = 0; i < starkInfo.nPublics; ++i) { - for(uint64_t j = 0; j < nrowsPack; ++j) { - publics[i*nrowsPack + j] = params.publicInputs[i]; - } - } - Goldilocks::Element evals[params.evals.degree()*FIELD_EXTENSION*nrowsPack]; - for(uint64_t i = 0; i < params.evals.degree(); ++i) { - for(uint64_t j = 0; j < nrowsPack; ++j) { - evals[(i*FIELD_EXTENSION)*nrowsPack + j] = params.evals[i][0]; - evals[(i*FIELD_EXTENSION + 1)*nrowsPack + j] = params.evals[i][1]; - evals[(i*FIELD_EXTENSION + 2)*nrowsPack + j] = params.evals[i][2]; - } + if((rowEnd -rowIni) % nrowsPack != 0) { + nrowsPack = 1; } - #pragma omp parallel for for (uint64_t i = rowIni; i < rowEnd; i+= nrowsPack) { uint64_t i_args = 0; @@ -720,7 +735,9 @@ class CHelpersStepsPack : public CHelpersSteps { } } } + storePolinomials(starkInfo, params, bufferT_, storePol, i, nrowsPack, domainExtended); + if (i_args != parserParams.nArgs) std::cout << " " << i_args << " - " << parserParams.nArgs << std::endl; assert(i_args == parserParams.nArgs); } diff --git a/src/starkpil/merkleTree/merkleTreeGL.cpp b/src/starkpil/merkleTree/merkleTreeGL.cpp index 6119069cc..9f392eb76 100644 --- a/src/starkpil/merkleTree/merkleTreeGL.cpp +++ b/src/starkpil/merkleTree/merkleTreeGL.cpp @@ -37,7 +37,7 @@ void MerkleTreeGL::genMerkleProof(Goldilocks::Element *proof, uint64_t idx, uint void MerkleTreeGL::merkelize() { #if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - PoseidonGoldilocks::merkletree_cuda(nodes, source, width, height); + PoseidonGoldilocks::merkletree_cuda_async(nodes, source, width, height); #else #ifdef __AVX512__ PoseidonGoldilocks::merkletree_avx512(nodes, source, width, height); diff --git a/src/starkpil/polinomial.hpp b/src/starkpil/polinomial.hpp index 90fc26974..96ff5f409 100644 --- a/src/starkpil/polinomial.hpp +++ b/src/starkpil/polinomial.hpp @@ -8,6 +8,7 @@ #include "zklog.hpp" #include "zkassert.hpp" #include "exit_process.hpp" +#include "memory.cuh" class Polinomial { @@ -17,6 +18,7 @@ class Polinomial uint64_t _dim = 0; uint64_t _offset = 0; bool _allocated = false; + bool _pinned = false; std::string _name = ""; public: @@ -27,6 +29,7 @@ class Polinomial _dim = 0; _offset = 0; _allocated = false; + _pinned = false; } Polinomial(void *pAddress, uint64_t degree, @@ -55,10 +58,38 @@ class Polinomial _allocated = true; }; + Polinomial(uint64_t degree, + uint64_t dim, + bool pinned, + std::string name = "") : _degree(degree), + _dim(dim), + _pinned(pinned), + _name(name) + { + if (degree == 0 || dim == 0) + return; + if (_pinned) { + _pAddress = (Goldilocks::Element *)calloc_zkevm(_degree * _dim, sizeof(Goldilocks::Element)); + } else { + _pAddress = (Goldilocks::Element *)calloc(_degree * _dim, sizeof(Goldilocks::Element)); + } + + if (_pAddress == NULL) + { + zklog.error("Polinomial::Polinomial() failed allocating polinomial with size: " + to_string(_degree * _dim * sizeof(Goldilocks::Element))); + exitProcess(); + } + _offset = _dim; + _allocated = true; + }; + ~Polinomial() { - if (_allocated) - free(_pAddress); + if (_allocated) { + if (_pinned) { free_zkevm(_pAddress); } + else { free(_pAddress); } + } + }; void potConstruct(Goldilocks::Element *pAddress, diff --git a/src/starkpil/starks.cpp b/src/starkpil/starks.cpp index 2c3b5882b..1ddae3331 100644 --- a/src/starkpil/starks.cpp +++ b/src/starkpil/starks.cpp @@ -13,9 +13,9 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil uint64_t numCommited = starkInfo.nCm1; Transcript transcript; - Polinomial evals(starkInfo.evMap.size(), FIELD_EXTENSION); + Polinomial evals(starkInfo.evMap.size(), FIELD_EXTENSION, true); Polinomial xDivXSubXi(&mem[starkInfo.mapOffsets.section[eSection::xDivXSubXi_2ns]], 2 * NExtended, FIELD_EXTENSION, FIELD_EXTENSION); - Polinomial challenges(NUM_CHALLENGES, FIELD_EXTENSION); + Polinomial challenges(NUM_CHALLENGES, FIELD_EXTENSION, true); CommitPols cmPols(pAddress, starkInfo.mapDeg.section[eSection::cm1_n]); @@ -51,17 +51,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil //-------------------------------- TimerStart(STARK_STEP_1); TimerStart(STARK_STEP_1_LDE_AND_MERKLETREE); -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - uint64_t ncols = starkInfo.mapSectionsN.section[eSection::cm1_n]; - if (ncols > 0) - { - ntt.LDE_MerkleTree_Auto(treesGL[0]->get_nodes_ptr(), p_cm1_n, N, NExtended, ncols, p_cm1_2ns); - } - else - { - treesGL[0]->merkelize(); - } -#else TimerStart(STARK_STEP_1_LDE); string nttHelperStage1 = reduceMemory ? "cm1_tmp" : "cm1"; std::pair nttOffsetHelperStage1 = starkInfo.mapNTTOffsetsHelpers[nttHelperStage1]; @@ -74,16 +63,13 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil nBlocksStage1++; } - if(reduceMemory) { - ntt.extendPol(p_cm1_2ns_tmp, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1, 3, nBlocksStage1); - } else { - ntt.extendPol(p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1, 3, nBlocksStage1); - } + printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1.first, nttOffsetHelperStage1.second); + ntt.extendPol(reduceMemory?p_cm1_2ns_tmp:p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1, 3, nBlocksStage1); TimerStopAndLog(STARK_STEP_1_LDE); + TimerStart(STARK_STEP_1_MERKLETREE); treesGL[0]->merkelize(); TimerStopAndLog(STARK_STEP_1_MERKLETREE); -#endif treesGL[0]->getRoot(root0.address()); zklog.info("MerkleTree rootGL 0: [ " + root0.toString(4) + " ]"); @@ -138,17 +124,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil TimerStopAndLog(STARK_STEP_2_CALCULATEH1H2_TRANSPOSE_2); TimerStart(STARK_STEP_2_LDE_AND_MERKLETREE); -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - ncols = starkInfo.mapSectionsN.section[eSection::cm2_n]; - if (ncols > 0) - { - ntt.LDE_MerkleTree_Auto(treesGL[1]->get_nodes_ptr(), p_cm2_n, N, NExtended, ncols, p_cm2_2ns); - } - else - { - treesGL[1]->merkelize(); - } -#else TimerStart(STARK_STEP_2_LDE); string nttHelperStage2 = reduceMemory ? "cm2_tmp" : "cm2"; std::pair nttOffsetHelperStage2 = starkInfo.mapNTTOffsetsHelpers[nttHelperStage2]; @@ -160,16 +135,13 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil while((nttOffsetHelperStage2.second * nBlocksStage2 < buffHelperElementsStage2) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2) ) { nBlocksStage2++; } - if(reduceMemory) { - ntt.extendPol(p_cm2_2ns_tmp, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2, 3, nBlocksStage2); - } else { - ntt.extendPol(p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2, 3, nBlocksStage2); - } + printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2.first, nttOffsetHelperStage2.second); + ntt.extendPol(reduceMemory?p_cm2_2ns_tmp:p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2, 3, nBlocksStage2); TimerStopAndLog(STARK_STEP_2_LDE); + TimerStart(STARK_STEP_2_MERKLETREE); treesGL[1]->merkelize(); TimerStopAndLog(STARK_STEP_2_MERKLETREE); -#endif treesGL[1]->getRoot(root1.address()); zklog.info("MerkleTree rootGL 1: [ " + root1.toString(4) + " ]"); @@ -204,17 +176,6 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil transposeZRows(pAddress, numCommited, newpols_); TimerStopAndLog(STARK_STEP_3_CALCULATE_Z_TRANSPOSE_2); TimerStart(STARK_STEP_3_LDE_AND_MERKLETREE); -#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE) - ncols = starkInfo.mapSectionsN.section[eSection::cm3_n]; - if (ncols > 0) - { - ntt.LDE_MerkleTree_Auto(treesGL[2]->get_nodes_ptr(), p_cm3_n, N, NExtended, ncols, p_cm3_2ns); - } - else - { - treesGL[2]->merkelize(); - } -#else TimerStart(STARK_STEP_3_LDE); std::pair nttOffsetHelperStage3 = starkInfo.mapNTTOffsetsHelpers["cm3"]; Goldilocks::Element *pBuffHelperStage3 = ¶ms.pols[nttOffsetHelperStage3.first]; @@ -225,12 +186,13 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil while((nttOffsetHelperStage3.second * nBlocksStage3 < buffHelperElementsStage3) || (starkInfo.mapSectionsN.section[cm3_n] > 256*nBlocksStage3) ) { nBlocksStage3++; } + printf("cm3 offset:%lu, size:%lu\n", nttOffsetHelperStage3.first, nttOffsetHelperStage3.second); ntt.extendPol(p_cm3_2ns, p_cm3_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm3_n], pBuffHelperStage3, 3, nBlocksStage3); TimerStopAndLog(STARK_STEP_3_LDE); + TimerStart(STARK_STEP_3_MERKLETREE); treesGL[2]->merkelize(); TimerStopAndLog(STARK_STEP_3_MERKLETREE); -#endif treesGL[2]->getRoot(root2.address()); zklog.info("MerkleTree rootGL 2: [ " + root2.toString(4) + " ]"); @@ -250,6 +212,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil nBlocksStage1_++; } + printf("cm1 offset:%lu, size:%lu\n", nttOffsetHelperStage1_.first, nttOffsetHelperStage1_.second); ntt.extendPol(p_cm1_2ns, p_cm1_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm1_n], pBuffHelperStage1_, 3, nBlocksStage1_); TimerStopAndLog(STARK_STEP_1_RECALCULATING_LDE); @@ -263,6 +226,7 @@ void Starks::genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldil while((nttOffsetHelperStage2_.second * nBlocksStage2_ < buffHelperElementsStage2_) || (starkInfo.mapSectionsN.section[cm2_n] > 256*nBlocksStage2_) ) { nBlocksStage2_++; } + printf("cm2 offset:%lu, size:%lu\n", nttOffsetHelperStage2_.first, nttOffsetHelperStage2_.second); ntt.extendPol(p_cm2_2ns, p_cm2_n, NExtended, N, starkInfo.mapSectionsN.section[eSection::cm2_n], pBuffHelperStage2_, 3, nBlocksStage2_); TimerStopAndLog(STARK_STEP_2_RECALCULATING_LDE); diff --git a/src/starkpil/starks.hpp b/src/starkpil/starks.hpp index 44cd69684..ad33ef4c4 100644 --- a/src/starkpil/starks.hpp +++ b/src/starkpil/starks.hpp @@ -88,9 +88,9 @@ void merkelizeMemory(); // function for DBG purposes NExtended(config.generateProof() ? 1 << starkInfo.starkStruct.nBitsExt : 0), ntt(config.generateProof() ? 1 << starkInfo.starkStruct.nBits : 0), nttExtended(config.generateProof() ? 1 << starkInfo.starkStruct.nBitsExt : 0), - x_n(config.generateProof() ? N : 0, config.generateProof() ? 1 : 0), - x_2ns(config.generateProof() ? NExtended : 0, config.generateProof() ? 1 : 0), - zi(config.generateProof() ? NExtended : 0, config.generateProof() ? 1 : 0), + x_n(config.generateProof() ? N : 0, config.generateProof() ? 1 : 0, true), + x_2ns(config.generateProof() ? NExtended : 0, config.generateProof() ? 1 : 0, true), + zi(config.generateProof() ? NExtended : 0, config.generateProof() ? 1 : 0, true), pAddress(_pAddress), x(config.generateProof() ? N << (starkInfo.starkStruct.nBitsExt - starkInfo.starkStruct.nBits) : 0, config.generateProof() ? FIELD_EXTENSION : 0) { @@ -132,7 +132,7 @@ void merkelizeMemory(); // function for DBG purposes if (!LOAD_CONST_FILES) { TimerStart(CALCULATE_CONST_TREE_TO_MEMORY); - pConstPolsAddress2ns = (void *)malloc(NExtended * starkInfo.nConstants * sizeof(Goldilocks::Element)); + pConstPolsAddress2ns = (void *)malloc_zkevm(NExtended * starkInfo.nConstants * sizeof(Goldilocks::Element)); if(pConstPolsAddress2ns == NULL) { zklog.error("Starks::Starks() failed to allocate pConstPolsAddress2ns"); @@ -257,7 +257,7 @@ void merkelizeMemory(); // function for DBG purposes { unmapFile(pConstPolsAddress, constPolsSize); } else { - free(pConstPolsAddress); + free_zkevm(pConstPolsAddress); } if(LOAD_CONST_FILES) { @@ -267,7 +267,7 @@ void merkelizeMemory(); // function for DBG purposes free(pConstTreeAddress); } } else { - free(pConstPolsAddress2ns); + free_zkevm(pConstPolsAddress2ns); } for (uint i = 0; i < 5; i++) @@ -279,7 +279,7 @@ void merkelizeMemory(); // function for DBG purposes assert(cHelpersBinFile.get() == nullptr); assert(cHelpersBinFile == nullptr); delete pCHelpers; - + }; void genProof(FRIProof &proof, Goldilocks::Element *publicInputs, Goldilocks::Element verkey[4], CHelpersSteps *chelpersSteps); diff --git a/src/utils/memory.cu b/src/utils/memory.cu index a8317b6a8..b525b38fe 100644 --- a/src/utils/memory.cu +++ b/src/utils/memory.cu @@ -3,19 +3,27 @@ #ifdef __USE_CUDA__ void *calloc_zkevm(uint64_t count, uint64_t size) { char *a; - cudaMallocManaged(&a, count*size); + uint64_t total = count*size; + cudaMallocHost(&a, total); + if (total > (1<<20)) { + uint64_t nPieces = (1<<8); + uint64_t segment = total/nPieces; + uint64_t last_segment = total - segment*(nPieces-1); #pragma omp parallel for - for (uint64_t i = 0; i < count; i++) { - memset(a+ i*size, 0, size); + for (int i = 0; i < nPieces; i++) { + memset(a+segment*i, 0, i==nPieces-1?last_segment:segment); + } + } else { + memset(a, 0, total); } return a; } void *malloc_zkevm(uint64_t size) { char *a; - cudaMallocManaged(&a, size); + cudaMallocHost(&a, size); return a; } -void free_zkevm(void *ptr) { cudaFree(ptr); } +void free_zkevm(void *ptr) { cudaFreeHost(ptr); } #endif diff --git a/src/utils/utils.cpp b/src/utils/utils.cpp index c05c8d61c..9bd2ddb42 100644 --- a/src/utils/utils.cpp +++ b/src/utils/utils.cpp @@ -442,7 +442,7 @@ void* loadFileParallel(const string &fileName, uint64_t size) { } // Allocate memory - void* buffer = malloc(size); + void* buffer = malloc_zkevm(size); if (buffer == NULL) { zklog.error("loadFileParallel() failed calling malloc() of size: " + to_string(size)); exitProcess();