-
Notifications
You must be signed in to change notification settings - Fork 2
/
GPU_GlobalMemory.c
136 lines (108 loc) · 3.97 KB
/
GPU_GlobalMemory.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#ifndef __GPU_TIMER_H__
#define __GPU_TIMER_H__
// Small stopwatch built on a pair of CUDA events.
//
// Usage: call Start() before the work to be timed, Stop() after it, then
// Elapsed() to obtain the time between the two recorded events. Both events
// are created in the constructor and destroyed in the destructor, so an
// instance should not be copied (a copy would destroy the same events twice).
struct GpuTimer
{
    cudaEvent_t start;  // recorded by Start()
    cudaEvent_t stop;   // recorded by Stop()

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    // Records the start event in `stream` (defaults to stream 0, as before).
    void Start(cudaStream_t stream = 0)
    {
        cudaEventRecord(start, stream);
    }

    // Records the stop event in `stream` (defaults to stream 0, as before).
    void Stop(cudaStream_t stream = 0)
    {
        cudaEventRecord(stop, stream);
    }

    // Blocks until the stop event has completed, then returns the time between
    // the start and stop events as reported by cudaEventElapsedTime
    // (milliseconds). Returns 0.0f if the events cannot be synchronized or
    // queried, instead of handing back an uninitialized value.
    float Elapsed()
    {
        float elapsed = 0.0f;
        if (cudaEventSynchronize(stop) != cudaSuccess)
            return 0.0f;
        if (cudaEventElapsedTime(&elapsed, start, stop) != cudaSuccess)
            return 0.0f;
        return elapsed;
    }
};
#endif
// String-matching kernel that reads the text and the pattern directly through global-memory pointers
// Brute-force exact string search, one candidate start position per thread.
//
// Launch layout: 1D grid of 1D blocks. Thread (blockDim.x*blockIdx.x + threadIdx.x)
// tests whether `target` occurs in `data` starting at that offset; threads whose
// offset leaves no room for the whole pattern exit immediately, so the grid may
// be larger than the number of candidate positions.
//
// Parameters:
//   data, dataLen     - text to search and its length in characters
//   target, targetLen - pattern to look for and its length in characters
//   matchPositions    - output array; each match stores its start offset in the
//                       slot obtained from atomicAdd on *numOfMatches, so the
//                       order of entries is not fixed. Needs room for every
//                       match (at most dataLen - targetLen + 1 entries beyond
//                       the initial counter value).
//   numOfMatches      - counter incremented once per match; entries are written
//                       starting at the value it holds when the kernel starts.
//
// An empty pattern, or a pattern longer than the text, produces no matches.
__global__ void stringMatchingGPU(char const *data, unsigned int dataLen, char const *target, unsigned int targetLen, unsigned int *matchPositions, unsigned int *numOfMatches){
    // Bail out before target[0] is touched: with targetLen == 0 there is no
    // element to read, and a pattern longer than the text cannot match.
    if(targetLen == 0 || targetLen > dataLen) return;

    unsigned int index = blockDim.x*blockIdx.x + threadIdx.x;

    // Phrased as a subtraction (safe after the guard above) so the comparison
    // cannot wrap around the way index + targetLen could.
    if(index > dataLen - targetLen) return;

    // Stop at the first differing character.
    for(unsigned int i=0; i<targetLen; i++){
        if(data[index+i] != target[i]) return;
    }

    // Every character agreed: claim the next free output slot.
    matchPositions[atomicAdd(numOfMatches, 1u)] = index;
}
// Prints a diagnostic and terminates the program if a CUDA runtime call failed.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver: searches a fixed text for a fixed pattern with
// stringMatchingGPU 1000 times and prints the mean kernel time.
//
// Buffers are allocated once and reused; only the match counter and the
// position array are reset before each run. Each run's timing covers the kernel
// launch only (between timer.Start() and timer.Stop()), not the copies.
// Command-line arguments are not used.
int main(int argc,char **argv)
{
    (void) argc;
    (void) argv;

    const int runs = 1000;
    const unsigned int threadsPerBlock = 256;     // multiple of the warp size
    const size_t hostBufferBytes = 1*1024*1024;   // 1MB per host string buffer

    // declare and allocate host memory
    char *h_data = (char *) malloc(hostBufferBytes);
    char *h_target = (char *) malloc(hostBufferBytes);
    if (h_data == NULL || h_target == NULL) {
        fprintf(stderr, "Out of host memory\n");
        free(h_data);
        free(h_target);
        return EXIT_FAILURE;
    }
    strcpy(h_data, "Lorem ipsum adore itom Lorem ipsum");
    strcpy(h_target, "ipsum");
    const unsigned int h_dataLen = (unsigned int) strlen(h_data);
    const unsigned int h_targetLen = (unsigned int) strlen(h_target);

    // Number of offsets at which the whole pattern still fits in the text.
    const unsigned int numCandidates =
        (h_targetLen != 0 && h_targetLen <= h_dataLen) ? h_dataLen - h_targetLen + 1 : 0;
    if (numCandidates == 0) {
        // Nothing to search: a zero-block launch would be an invalid configuration.
        free(h_data);
        free(h_target);
        printf("Time elapsed = %g ms\n", 0.0);
        return 0;
    }
    // Ceil-div so every candidate offset gets a thread; the kernel's own bounds
    // check retires the surplus threads in the last block.
    const unsigned int numBlocks = (numCandidates + threadsPerBlock - 1) / threadsPerBlock;

    const size_t positionsBytes = h_dataLen * sizeof(unsigned int);
    unsigned int *h_matchPositions = (unsigned int *) malloc(positionsBytes);
    unsigned int h_numOfMatches = 0;
    if (h_matchPositions == NULL) {
        fprintf(stderr, "Out of host memory\n");
        free(h_data);
        free(h_target);
        return EXIT_FAILURE;
    }

    // declare and allocate GPU memory, then upload the text and the pattern
    char *d_data = NULL;
    char *d_target = NULL;
    unsigned int *d_matchPositions = NULL;
    unsigned int *d_numOfMatches = NULL;
    checkCuda(cudaMalloc((void **) &d_data, h_dataLen*sizeof(char)), "cudaMalloc d_data");
    checkCuda(cudaMalloc((void **) &d_target, h_targetLen*sizeof(char)), "cudaMalloc d_target");
    checkCuda(cudaMalloc((void **) &d_matchPositions, positionsBytes), "cudaMalloc d_matchPositions");
    checkCuda(cudaMalloc((void **) &d_numOfMatches, sizeof(unsigned int)), "cudaMalloc d_numOfMatches");
    checkCuda(cudaMemcpy(d_data, h_data, h_dataLen*sizeof(char), cudaMemcpyHostToDevice), "copy data to device");
    checkCuda(cudaMemcpy(d_target, h_target, h_targetLen*sizeof(char), cudaMemcpyHostToDevice), "copy target to device");

    double timerElapsed = 0.0;
    GpuTimer timer;
    for(int N=0;N<runs;N++){
        // zero out the per-run outputs
        checkCuda(cudaMemset((void *) d_numOfMatches, 0, sizeof(unsigned int)), "reset match counter");
        checkCuda(cudaMemset((void *) d_matchPositions, 0, positionsBytes), "reset match positions");

        // launch the kernel
        timer.Start();
        stringMatchingGPU<<<numBlocks,threadsPerBlock>>>(d_data, h_dataLen, d_target, h_targetLen, d_matchPositions, d_numOfMatches);
        timer.Stop();
        // Launch-configuration errors are only visible through cudaGetLastError.
        checkCuda(cudaGetLastError(), "stringMatchingGPU launch");

        // copy back the match positions and the match count from the GPU;
        // these blocking copies also surface any fault raised inside the kernel
        checkCuda(cudaMemcpy(h_matchPositions, d_matchPositions, positionsBytes, cudaMemcpyDeviceToHost), "copy match positions to host");
        checkCuda(cudaMemcpy(&h_numOfMatches, d_numOfMatches, sizeof(unsigned int), cudaMemcpyDeviceToHost), "copy match count to host");
        //printf("Number Of Matches = %u\n", h_numOfMatches);
        /*
        for(unsigned int i=0; i<h_numOfMatches; i++)
        printf("%u\n", h_matchPositions[i]);
        */
        timerElapsed += timer.Elapsed();
    }

    // free GPU and host memory allocations
    checkCuda(cudaFree(d_data), "cudaFree d_data");
    checkCuda(cudaFree(d_target), "cudaFree d_target");
    checkCuda(cudaFree(d_matchPositions), "cudaFree d_matchPositions");
    checkCuda(cudaFree(d_numOfMatches), "cudaFree d_numOfMatches");
    free(h_data);
    free(h_target);
    free(h_matchPositions);

    // mean kernel time over all runs
    printf("Time elapsed = %g ms\n", timerElapsed/runs);
    return 0;
}