Commit

Test some shapes.
liuliu committed Aug 16, 2024
1 parent 5c5bc18 commit cde0b15
Showing 2 changed files with 279 additions and 1 deletion.
275 changes: 275 additions & 0 deletions bin/nnc/adversarial_shape_test.cpp
@@ -0,0 +1,275 @@
extern "C" {
#include <ccv.h>
#include <nnc/ccv_nnc.h>
#include <sys/time.h>
#include <ctype.h>
}
#include "nnc/mfa/v2/ShaderCache.hpp"
#include "nnc/mfa/v2/GEMMDescriptor.hpp"
#include "nnc/mfa/v2/GEMMKernelDescriptor.hpp"
#include "nnc/mfa/v2/GEMMKernel.hpp"
#include "3rdparty/dsfmt/dSFMT.h"
#include <iostream>

ShaderCache shaderCache;

std::pair<int, int> profileProblemSize(GEMMDescriptor descriptor)
{
	const int problemSize1 = descriptor.matrixDimensions[0];
	const int problemSize2 = descriptor.matrixDimensions[1];
	const int problemSize3 = descriptor.matrixDimensions[2];

	// Allocate FP32 memory for the operands.
	float* A = (float*)ccmalloc(sizeof(float) * problemSize1 * problemSize3);
	float* B = (float*)ccmalloc(sizeof(float) * problemSize2 * problemSize3);
	float* C = (float*)ccmalloc(sizeof(float) * problemSize1 * problemSize2);
	float* bias = (float*)ccmalloc(sizeof(float) * problemSize2);


	dsfmt_t dsfmt;
	dsfmt_init_gen_rand(&dsfmt, 1);

	// Initialize A to random numbers.
	int i, j;
	for (i = 0; i < problemSize3; i++)
		for (j = 0; j < problemSize1; j++)
			A[i * problemSize1 + j] = dsfmt_genrand_open_close(&dsfmt);
	// Initialize B to random numbers.
	for (int rowID = 0; rowID < problemSize2; rowID++)
	{
		for (int columnID = 0; columnID < problemSize3; columnID++)
		{
			const int address = rowID * problemSize3 + columnID;
			B[address] = dsfmt_genrand_open_close(&dsfmt);
		}
	}

	// Initialize the bias to random numbers.
	for (int rowID = 0; rowID < problemSize2; rowID++)
	{
		bias[rowID] = dsfmt_genrand_open_close(&dsfmt);
	}
	void* A_storage = nullptr;
	if (descriptor.memoryPrecisions.A == GEMMOperandPrecision::FP16)
	{
		A_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize1 * problemSize3);
		ccv_float_to_half_precision(A, (uint16_t*)A_storage, problemSize1 * problemSize3);
		void* t = A_storage;
		A_storage = A;
		A = (float*)t;
	} else if (descriptor.memoryPrecisions.A == GEMMOperandPrecision::BF16) {
		A_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize1 * problemSize3);
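		// BF16 is the upper 16 bits of an FP32 bit pattern, so on a little-endian host
		// copying index i * 2 + 1 of each float truncates it to BF16 (no rounding).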
		for (int i = 0; i < problemSize1 * problemSize3; i++)
			((uint16_t*)A_storage)[i] = ((uint16_t*)A)[i * 2 + 1];
		void* t = A_storage;
		A_storage = A;
		A = (float*)t;
	}
	void* B_storage = nullptr;
	if (descriptor.memoryPrecisions.B == GEMMOperandPrecision::FP16)
	{
		B_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2 * problemSize3);
		ccv_float_to_half_precision(B, (uint16_t*)B_storage, problemSize2 * problemSize3);
		void* t = B_storage;
		B_storage = B;
		B = (float*)t;
	} else if (descriptor.memoryPrecisions.B == GEMMOperandPrecision::BF16) {
		B_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2 * problemSize3);
		for (int i = 0; i < problemSize2 * problemSize3; i++)
			((uint16_t*)B_storage)[i] = ((uint16_t*)B)[i * 2 + 1];
		void* t = B_storage;
		B_storage = B;
		B = (float*)t;
	}
	void* bias_storage = nullptr;
	if (descriptor.memoryPrecisions.bias == GEMMOperandPrecision::FP16)
	{
		bias_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2);
		ccv_float_to_half_precision(bias, (uint16_t*)bias_storage, problemSize2);
		void* t = bias_storage;
		bias_storage = bias;
		bias = (float*)t;
	} else if (descriptor.memoryPrecisions.bias == GEMMOperandPrecision::BF16) {
		bias_storage = (uint16_t*)ccmalloc(sizeof(uint16_t) * problemSize2);
		for (int i = 0; i < problemSize2; i++)
			((uint16_t*)bias_storage)[i] = ((uint16_t*)bias)[i * 2 + 1];
		void* t = bias_storage;
		bias_storage = bias;
		bias = (float*)t;
	}

	// Multiply A with B.
	int maxGFLOPS = 0;
	int occupancy = 0;
	DeviceProperties dprops;
	NS::SharedPtr<MTL::Device> device = NS::TransferPtr(MTL::CreateSystemDefaultDevice());
	NS::SharedPtr<MTL::CommandQueue> queue = NS::TransferPtr(device->newCommandQueue());
	{
		// Generate the kernel.
		auto pipelineValue = shaderCache.findKernel<GEMMKernel, GEMMDescriptor, GEMMKernelDescriptor>(descriptor, device.get(), dprops);
		occupancy = pipelineValue->pipeline->maxTotalThreadsPerThreadgroup();
		NS::SharedPtr<MTL::Buffer> bufferA = NS::TransferPtr(device->newBuffer(A, descriptor.memoryPrecisions.A.size() * problemSize1 * problemSize3, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
		NS::SharedPtr<MTL::Buffer> bufferB = NS::TransferPtr(device->newBuffer(B, descriptor.memoryPrecisions.B.size() * problemSize2 * problemSize3, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
		NS::SharedPtr<MTL::Buffer> bufferC = NS::TransferPtr(device->newBuffer(C, descriptor.memoryPrecisions.C.size() * problemSize1 * problemSize2, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
		NS::SharedPtr<MTL::Buffer> bufferBias = NS::TransferPtr(device->newBuffer(bias, descriptor.memoryPrecisions.bias.size() * problemSize2, MTL::ResourceStorageModeShared | MTL::ResourceHazardTrackingModeTracked));
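		// Run 15 timing trials and keep the best (peak) GFLOPS figure.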
		for (int i = 0; i < 15; i++)
		{
			const int duplicatedCommandCount = 20;
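			// Encoding the same GEMM 20 times in one command buffer amortizes command
			// buffer overhead, so the measured latency mostly reflects kernel execution.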
			NS::SharedPtr<MTL::CommandBuffer> commandBuffer = NS::TransferPtr(queue->commandBuffer());
			NS::SharedPtr<MTL::ComputeCommandEncoder> encoder = NS::TransferPtr(commandBuffer->computeCommandEncoder());
			encoder->setComputePipelineState(pipelineValue->pipeline.get());
			encoder->setThreadgroupMemoryLength(pipelineValue->kernel->threadgroupMemoryAllocation, 0);
			encoder->setBuffer(bufferA.get(), 0, 0);
			encoder->setBuffer(bufferB.get(), 0, 1);
			encoder->setBuffer(bufferC.get(), 0, 2);
			encoder->useResource(bufferA.get(), MTL::ResourceUsageRead);
			encoder->useResource(bufferB.get(), MTL::ResourceUsageRead);
			encoder->useResource(bufferC.get(), MTL::ResourceUsageWrite);
			if (descriptor.useBias)
			{
				encoder->setBuffer(bufferBias.get(), 0, 3);
				encoder->useResource(bufferBias.get(), MTL::ResourceUsageRead);
			}
			for (int j = 0; j < duplicatedCommandCount; j++)
			{
				auto ceilDivide =
					[=](int64_t target, uint16_t granularity) -> int64_t {
						return (target + int64_t(granularity) - 1) / int64_t(granularity);
					};
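				// Grid x covers N (problemSize2) in tiles of blockDimensions[1];
				// grid y covers M (problemSize1) in tiles of blockDimensions[0].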
				MTL::Size gridSize = MTL::Size(ceilDivide(problemSize2, pipelineValue->kernel->blockDimensions[1]), ceilDivide(problemSize1, pipelineValue->kernel->blockDimensions[0]), 1);
				MTL::Size groupSize = MTL::Size(pipelineValue->kernel->threadgroupSize, 1, 1);
				encoder->dispatchThreadgroups(gridSize, groupSize);
			}
			encoder->endEncoding();
			commandBuffer->commit();
			commandBuffer->waitUntilCompleted();
			auto start = commandBuffer->GPUStartTime();
			auto end = commandBuffer->GPUEndTime();
			auto latency = end - start;

			// Determine the amount of work done.
			auto operations = (int64_t)2 * problemSize1 * problemSize2 * problemSize3;
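			// 2 FLOPs (multiply + add) for each of the M * N * K multiply-accumulates.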
			operations = operations * duplicatedCommandCount;
			auto gflops = (int)((double)operations / (double)latency / 1e9);

			// Report the results.
			// let latencyMicroseconds = Int(latency / 1e-6)
			// print(latencyMicroseconds, "μs", gflops, "GFLOPS")
			maxGFLOPS = std::max(maxGFLOPS, gflops);
		}
	}

	ccfree(A);
	ccfree(B);
	ccfree(C);
	ccfree(bias);
	if (A_storage != nullptr)
		ccfree(A_storage);
	if (B_storage != nullptr)
		ccfree(B_storage);
	if (bias_storage != nullptr)
		ccfree(bias_storage);
	return std::make_pair(maxGFLOPS, occupancy);
}

struct TestDescriptor {
	GEMMOperandPrecision precision;
	int problemSize[3];
	bool transposeState[2];
	bool useBias;
};

void runTest(TestDescriptor descriptor)
{
	// Set up the kernel.
	GEMMDescriptor gemmDesc = GEMMDescriptor();
	auto precision = descriptor.precision;
	unsigned int m = (unsigned int)descriptor.problemSize[0];
	unsigned int n = (unsigned int)descriptor.problemSize[1];
	unsigned int k = (unsigned int)descriptor.problemSize[2];
	gemmDesc.matrixDimensions = simd::uint3 { m, n, k };
	gemmDesc.memoryPrecisions = {
		.A = precision, .B = precision, .C = precision, .bias = precision
	};
	gemmDesc.transposeState = simd::uchar3 { descriptor.transposeState[0], descriptor.transposeState[1], descriptor.transposeState[0] };
	gemmDesc.useBias = descriptor.useBias;

	// Test the kernel.
	auto statistic = profileProblemSize(gemmDesc);

	// Report the results.
	std::cout << "problemSize = " << descriptor.problemSize[0] << "x" << descriptor.problemSize[1] << "x" << descriptor.problemSize[2] << " | ";
	if (descriptor.transposeState[0])
	{
		std::cout << "A^T ";
	} else {
		std::cout << "A ";
	}
	if (descriptor.transposeState[1])
	{
		std::cout << "B^T ";
	} else {
		std::cout << "B ";
	}
	if (descriptor.useBias)
	{
		std::cout << "+ BIAS | ";
	} else {
		std::cout << " | ";
	}

	std::cout << statistic.first << " GFLOPS " << statistic.second << " threads/core | " << std::endl;
}

int main(int argc, char** argv)
{
	ccv_nnc_init();
	{
		bool transposeStates[] = {
			false, false,
			false, true,
			// true, false,
			// true, true,
			false, false,
			false, true,
			// true, false,
			// true, true,
		};
		bool useBias[] = {
			false,
			false,
			// false,
			// false,
			true,
			true,
			// true,
			// true
		};
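		// Each useBias entry pairs with the transposeStates row at the same index j.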
		int problemSizes[] = {
			4608 * 2, 3072, 3072 * 4,
			4608 * 2, 3072 * 4, 3072,
			4608 * 2, 3072, 3072,
			// 4608, 3072, 3072 * 3,
			// 4608, 3072 * 3, 3072,
		};

		printf("\nPerformance tests:\n");
		for (int i = 0; i < sizeof(problemSizes) / (sizeof(int) * 3); i++)
		// for (int problemSize = 7936; problemSize <= 3072 * 4; problemSize += 128)
		{
			for (int j = 0; j < sizeof(transposeStates) / (sizeof(bool) * 2); j++)
			{
				TestDescriptor testDescriptor = TestDescriptor();
				testDescriptor.precision = GEMMOperandPrecision::FP16;
				testDescriptor.problemSize[0] = problemSizes[i * 3];
				testDescriptor.problemSize[1] = problemSizes[i * 3 + 1];
				testDescriptor.problemSize[2] = problemSizes[i * 3 + 2];
				testDescriptor.transposeState[0] = transposeStates[j * 2];
				testDescriptor.transposeState[1] = transposeStates[j * 2 + 1];
				testDescriptor.useBias = useBias[j];
				runTest(testDescriptor);
			}
		}
	}
	return 0;
}
5 changes: 4 additions & 1 deletion bin/nnc/makefile
@@ -4,7 +4,7 @@ LDFLAGS := -L"../../lib" -lccv $(LDFLAGS)
CFLAGS := -O3 -Wall -I"../../lib" $(CFLAGS)
NVFLAGS := -O3 -I"../../lib" -lineinfo $(NVFLAGS)

TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm laplacian_test
TARGETS = nnc-e2e-verify nnc-e2e-sym-verify nnc-sym cifar-10 imagenet coco imdb iwslt wmt csv imdb_lstm laplacian_test adversarial_shape_test

FUZZ_TARGETS = csv_fuzz

@@ -40,6 +40,9 @@ libccv.a:
laplacian_test.o: laplacian_test.cpp
	$(CC) $< -o $@ -c $(CFLAGS) -std=c++17

adversarial_shape_test.o: adversarial_shape_test.cpp
	$(CC) $< -o $@ -c $(CFLAGS) -std=c++17

.gitignore:
	echo $(TARGETS) | tr ' ' '\n' > .gitignore

