Skip to content

Commit

Permalink
further gpu optimization
Browse files Browse the repository at this point in the history
  • Loading branch information
yann-sjtu committed Aug 28, 2024
1 parent 290a81a commit 117678a
Show file tree
Hide file tree
Showing 19 changed files with 1,180 additions and 1,254 deletions.
3 changes: 2 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
[submodule "src/goldilocks"]
path = src/goldilocks
url = https://github.com/0xPolygonHermez/goldilocks.git
url = https://github.com/okx/goldilocks.git
branch = "develop"
10 changes: 6 additions & 4 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#INFO := $(shell cd src/goldilocks && ./configure.sh && cd ../.. && sleep 2)
#include src/goldilocks/CudaArch.mk
include src/goldilocks/CudaArch.mk
NVCC := /usr/local/cuda/bin/nvcc

TARGET_ZKP := zkProver
Expand Down Expand Up @@ -45,6 +45,8 @@ else
CXXFLAGS += -O3
endif

PROVER_FORK_ID=10

ifdef PROVER_FORK_ID
CXXFLAGS += -DPROVER_FORK_ID=$(PROVER_FORK_ID)
endif
Expand All @@ -70,7 +72,7 @@ GRPC_CPP_PLUGIN = grpc_cpp_plugin
GRPC_CPP_PLUGIN_PATH ?= `which $(GRPC_CPP_PLUGIN)`

INC_DIRS := $(shell find $(SRC_DIRS) -type d) $(sort $(dir))
INC_FLAGS := $(addprefix -I,$(INC_DIRS))
INC_FLAGS := $(addprefix -I,$(INC_DIRS)) -I/usr/local/cuda/include

SRCS_ZKP := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc \))
SRCS_ZKP_GPU := $(shell find $(SRC_DIRS) ! -path "./src/fflonk_setup/fflonk_setup*" ! -path "./tools/starkpil/bctree/*" ! -path "./test/examples/*" ! -path "./test/expressions/*" ! -path "./test/prover/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/benchs/*" ! -path "./src/goldilocks/tests/*" ! -path "./src/main_generator/*" ! -path "./src/pols_generator/*" ! -path "./src/pols_diff/*" ! -path "./src/witness2db/*" ! -path "./src/goldilocks/utils/deviceQuery.cu" \( -name *.cpp -or -name *.c -or -name *.asm -or -name *.cc -or -name *.cu \))
Expand Down Expand Up @@ -157,7 +159,7 @@ $(BUILD_DIR_GPU)/%.asm.o: %.asm
# c++ source
$(BUILD_DIR_GPU)/%.cpp.o: %.cpp
$(MKDIR_P) $(dir $@)
$(CXX) -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@
$(CXX) -DENABLE_EXPERIMENTAL_CODE -D__USE_CUDA__ $(CFLAGS) $(CPPFLAGS) $(CXXFLAGS) -c $< -o $@

$(BUILD_DIR_GPU)/%.cc.o: %.cc
$(MKDIR_P) $(dir $@)
Expand All @@ -166,7 +168,7 @@ $(BUILD_DIR_GPU)/%.cc.o: %.cc
# cuda source
$(BUILD_DIR_GPU)/%.cu.o: %.cu
$(MKDIR_P) $(dir $@)
$(NVCC) -D__USE_CUDA__ $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@
$(NVCC) -DENABLE_EXPERIMENTAL_CODE -D__USE_CUDA__ $(INC_FLAGS) -Isrc/goldilocks/utils -Xcompiler -fopenmp -Xcompiler -fPIC -Xcompiler -mavx2 -Xcompiler -O3 -O3 -arch=$(CUDA_ARCH) -O3 $< -dc --output-file $@

main_generator: $(BUILD_DIR)/$(TARGET_MNG)

Expand Down
4 changes: 2 additions & 2 deletions src/config/definitions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,8 @@
#define DATABASE_USE_CACHE // If defined, the Database class uses a cache
#define USE_NEW_KVTREE

#define MAIN_SM_EXECUTOR_GENERATED_CODE
#define MAIN_SM_PROVER_GENERATED_CODE
//#define MAIN_SM_EXECUTOR_GENERATED_CODE
//#define MAIN_SM_PROVER_GENERATED_CODE

#define LOAD_CONST_FILES false

Expand Down
39 changes: 24 additions & 15 deletions src/prover/prover.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@
#include "commit_pols_starks.hpp"
#include "chelpers_steps.hpp"
#include "chelpers_steps_pack.hpp"
#include "chelpers_steps_gpu.hpp"
#ifdef __AVX512__
#include "chelpers_steps_avx512.hpp"
#endif
Expand All @@ -54,6 +53,7 @@
#include "cuda_utils.hpp"
#include "ntt_goldilocks.hpp"
#include <pthread.h>
#include "chelpers_steps_pack.cuh"

int asynctask(void* (*task)(void* args), void* arg)
{
Expand Down Expand Up @@ -93,6 +93,10 @@ Prover::Prover(Goldilocks &fr,
{
if (config.generateProof())
{
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
alloc_pinned_mem(uint64_t(1<<25) * 128);
warmup_gpu();
#endif
TimerStart(PROVER_INIT);

//checkSetupHash(config.zkevmVerifier);
Expand Down Expand Up @@ -136,11 +140,6 @@ Prover::Prover(Goldilocks &fr,
}
zklog.info("Prover::genBatchProof() successfully allocated " + to_string(polsSize) + " bytes");
}

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
alloc_pinned_mem(uint64_t(1<<24) * _starkInfo.mapSectionsN.section[eSection::cm1_n]);
warmup_gpu();
#endif

json finalVerkeyJson;
file2json(config.finalVerkey, finalVerkeyJson);
Expand Down Expand Up @@ -519,8 +518,11 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
json recursive2Verkey;
file2json(config.recursive2Verkey, recursive2Verkey);

#ifdef __USE_CUDA__
Goldilocks::Element *publics = (Goldilocks::Element *)malloc_zkevm(starksRecursive1->starkInfo.nPublics);
#else
Goldilocks::Element publics[starksRecursive1->starkInfo.nPublics];

#endif
// oldStateRoot
publics[0] = cmPols.Main.B0[0];
publics[1] = cmPols.Main.B1[0];
Expand Down Expand Up @@ -598,24 +600,26 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
/*************************************/

#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsGPU cHelpersSteps;
CHelpersStepsPackGPU cHelpersStepsZkevm;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
CHelpersStepsPack cHelpersSteps;
CHelpersStepsAvx512 cHelpersStepsZkevm;
#elif defined(__PACK__)
CHelpersStepsPack cHelpersStepsZkevm;
cHelpersSteps.nrowsPack = NROWS_PACK;
#else
CHelpersSteps cHelpersSteps;
CHelpersSteps cHelpersStepsZkevm;
#endif

CHelpersSteps cHelpersSteps;

TimerStart(STARK_PROOF_BATCH_PROOF);

ZkevmSteps zkevmChelpersSteps;
uint64_t polBits = starkZkevm->starkInfo.starkStruct.steps[starkZkevm->starkInfo.starkStruct.steps.size() - 1].nBits;
FRIProof fproof((1 << polBits), FIELD_EXTENSION, starkZkevm->starkInfo.starkStruct.steps.size(), starkZkevm->starkInfo.evMap.size(), starkZkevm->starkInfo.nPublics);

if(USE_GENERIC_PARSER) {
starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &cHelpersSteps);
starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &cHelpersStepsZkevm);
} else {
starkZkevm->genProof(fproof, &publics[0], zkevmVerkey, &zkevmChelpersSteps);
}
Expand Down Expand Up @@ -727,10 +731,15 @@ void Prover::genBatchProof(ProverRequest *pProverRequest)
jProofRecursive1["publics"] = publicStarkJson;
json2file(jProofRecursive1, pProverRequest->filePrefix + "batch_proof.proof.json");
}

#ifdef __USE_CUDA__
free_zkevm(publics);
#endif
TimerStopAndLog(SAVE_PROOF);
}

TimerStopAndLog(PROVER_BATCH_PROOF);
assert(0);
}

void Prover::genAggregatedProof(ProverRequest *pProverRequest)
Expand Down Expand Up @@ -846,7 +855,7 @@ void Prover::genAggregatedProof(ProverRequest *pProverRequest)

if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsGPU cHelpersSteps;
CHelpersStepsPackGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down Expand Up @@ -955,7 +964,7 @@ void Prover::genFinalProof(ProverRequest *pProverRequest)
FRIProofC12 fproofRecursiveF((1 << polBitsRecursiveF), FIELD_EXTENSION, starksRecursiveF->starkInfo.starkStruct.steps.size(), starksRecursiveF->starkInfo.evMap.size(), starksRecursiveF->starkInfo.nPublics);
if(USE_GENERIC_PARSER) {
#if defined(__USE_CUDA__) && defined(ENABLE_EXPERIMENTAL_CODE)
CHelpersStepsGPU cHelpersSteps;
CHelpersStepsPackGPU cHelpersSteps;
#elif defined(__AVX512__)
CHelpersStepsAvx512 cHelpersSteps;
#elif defined(__PACK__)
Expand Down
2 changes: 1 addition & 1 deletion src/starkpil/chelpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,4 @@ void CHelpers::loadCHelpers(BinFileUtils::BinFile *cHelpersBin) {
}

cHelpersBin->endReadSection();
};
};
22 changes: 14 additions & 8 deletions src/starkpil/chelpers_steps.hpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#ifndef CHELPERS_STEPS_HPP
#define CHELPERS_STEPS_HPP
#include "chelpers.hpp"

#include "steps.hpp"
#include "definitions.hpp"

class CHelpersSteps {
public:
Expand Down Expand Up @@ -75,14 +75,19 @@ class CHelpersSteps {
bool isTmpPol = !domainExtended && s == 4;
for(uint64_t k = 0; k < nColsStages[s]; ++k) {
uint64_t dim = storePol[nColsStagesAcc[s] + k];
if(storePol[nColsStagesAcc[s] + k]) {
if(!TRANSPOSE_TMP_POLS) {
__m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
} else {
if(storePol[nColsStagesAcc[s] + k]) {
__m256i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
}
} else {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
} else {
Goldilocks::store_avx(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
}
}
Expand Down Expand Up @@ -165,6 +170,7 @@ class CHelpersSteps {
uint8_t *storePol = &parserArgs.storePols[parserParams.storePolsOffset];

setBufferTInfo(starkInfo, parserParams.stage);

Goldilocks3::Element_avx challenges[params.challenges.degree()];
Goldilocks3::Element_avx challenges_ops[params.challenges.degree()];
for(uint64_t i = 0; i < params.challenges.degree(); ++i) {
Expand Down Expand Up @@ -703,4 +709,4 @@ class CHelpersSteps {
}
};

#endif
#endif
21 changes: 14 additions & 7 deletions src/starkpil/chelpers_steps_avx512.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "chelpers.hpp"
#include "chelpers_steps.hpp"
#include "steps.hpp"
#include "definitions.hpp"

class CHelpersStepsAvx512 : public CHelpersSteps {
public:
Expand Down Expand Up @@ -75,14 +76,19 @@ class CHelpersStepsAvx512 : public CHelpersSteps {
bool isTmpPol = !domainExtended && s == 4;
for(uint64_t k = 0; k < nColsStages[s]; ++k) {
uint64_t dim = storePol[nColsStagesAcc[s] + k];
if(storePol[nColsStagesAcc[s] + k]) {
if(!TRANSPOSE_TMP_POLS) {
__m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
} else {
if(storePol[nColsStagesAcc[s] + k]) {
__m512i *buffT = &bufferT_[(nColsStagesAcc[s] + k)];
if(isTmpPol) {
for(uint64_t i = 0; i < dim; ++i) {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k * domainSize + row * dim + i], uint64_t(dim), buffT[i]);
}
} else {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
} else {
Goldilocks::store_avx512(&params.pols[offsetsStages[s] + k + row * nColsStages[s]], nColsStages[s], buffT[0]);
}
}
}
Expand Down Expand Up @@ -701,6 +707,7 @@ class CHelpersStepsAvx512 : public CHelpersSteps {
assert(i_args == parserParams.nArgs);
}
}

};

#endif
#endif
Loading

0 comments on commit 117678a

Please sign in to comment.