-
Notifications
You must be signed in to change notification settings - Fork 0
/
smallptCUDA.cu
242 lines (183 loc) · 10.7 KB
/
smallptCUDA.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
// smallptCUDA by Sam Lapere, 2015
// based on smallpt, a path tracer by Kevin Beason, 2008
#include <iostream>
#include <cuda_runtime.h>
#include <vector_types.h>
#include "device_launch_parameters.h"
#include <cutil_math.h> // from http://www.icmc.usp.br/~castelo/CUDA/common/inc/cutil_math.h
#define M_PI 3.14159265359f // pi
#define width 512 // screenwidth
#define height 384 // screenheight
#define samps 1024 // samples
// __device__ : executed on the device (GPU) and callable only from the device
struct Ray {
float3 orig; // ray origin
float3 dir; // ray direction
__device__ Ray(float3 o_, float3 d_) : orig(o_), dir(d_) {}
};
enum Refl_t { DIFF, SPEC, REFR }; // material types, used in radiance(), only DIFF used here
struct Sphere {
float rad; // radius
float3 pos, emi, col; // position, emission, colour
Refl_t refl; // reflection type (e.g. diffuse)
__device__ float intersect_sphere(const Ray &r) const {
// ray/sphere intersection
// returns distance t to intersection point, 0 if no hit
// ray equation: p(x,y,z) = ray.orig + t*ray.dir
// general sphere equation: x^2 + y^2 + z^2 = rad^2
// classic quadratic equation of form ax^2 + bx + c = 0
// solution x = (-b +- sqrt(b*b - 4ac)) / 2a
// solve t^2*ray.dir*ray.dir + 2*t*(orig-p)*ray.dir + (orig-p)*(orig-p) - rad*rad = 0
// more details in "Realistic Ray Tracing" book by P. Shirley or Scratchapixel.com
float3 op = pos - r.orig; // distance from ray.orig to center sphere
float t, epsilon = 0.0001f; // epsilon required to prevent floating point precision artefacts
float b = dot(op, r.dir); // b in quadratic equation
float disc = b*b - dot(op, op) + rad*rad; // discriminant quadratic equation
if (disc<0) return 0; // if disc < 0, no real solution (we're not interested in complex roots)
else disc = sqrtf(disc); // if disc >= 0, check for solutions using negative and positive discriminant
return (t = b - disc)>epsilon ? t : ((t = b + disc)>epsilon ? t : 0); // pick closest point in front of ray origin
}
};
// SCENE
// 9 spheres forming a Cornell box
// small enough to be in constant GPU memory
// { float radius, { float3 position }, { float3 emission }, { float3 colour }, refl_type }
__constant__ Sphere spheres[] = {
{ 1e5f, { 1e5f + 1.0f, 40.8f, 81.6f }, { 0.0f, 0.0f, 0.0f }, { 0.75f, 0.25f, 0.25f }, DIFF }, //Left
{ 1e5f, { -1e5f + 99.0f, 40.8f, 81.6f }, { 0.0f, 0.0f, 0.0f }, { .25f, .25f, .75f }, DIFF }, //Rght
{ 1e5f, { 50.0f, 40.8f, 1e5f }, { 0.0f, 0.0f, 0.0f }, { .75f, .75f, .75f }, DIFF }, //Back
{ 1e5f, { 50.0f, 40.8f, -1e5f + 600.0f }, { 0.0f, 0.0f, 0.0f }, { 1.00f, 1.00f, 1.00f }, DIFF }, //Frnt
{ 1e5f, { 50.0f, 1e5f, 81.6f }, { 0.0f, 0.0f, 0.0f }, { .75f, .75f, .75f }, DIFF }, //Botm
{ 1e5f, { 50.0f, -1e5f + 81.6f, 81.6f }, { 0.0f, 0.0f, 0.0f }, { .75f, .75f, .75f }, DIFF }, //Top
{ 16.5f, { 27.0f, 16.5f, 47.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, DIFF }, // small sphere 1
{ 16.5f, { 73.0f, 16.5f, 78.0f }, { 0.0f, 0.0f, 0.0f }, { 1.0f, 1.0f, 1.0f }, DIFF }, // small sphere 2
{ 600.0f, { 50.0f, 681.6f - .77f, 81.6f }, { 2.0f, 1.8f, 1.6f }, { 0.0f, 0.0f, 0.0f }, DIFF } // Light
};
__device__ inline bool intersect_scene(const Ray &r, float &t, int &id){
float n = sizeof(spheres) / sizeof(Sphere), d, inf = t = 1e20; // t is distance to closest intersection, initialise t to a huge number outside scene
for (int i = int(n); i--;) // test all scene objects for intersection
if ((d = spheres[i].intersect_sphere(r)) && d<t){ // if newly computed intersection distance d is smaller than current closest intersection distance
t = d; // keep track of distance along ray to closest intersection point
id = i; // and closest intersected object
}
return t<inf; // returns true if an intersection with the scene occurred, false when no hit
}
// random number generator from https://github.com/gz/rust-raytracer
__device__ static float getrandom(unsigned int *seed0, unsigned int *seed1) {
*seed0 = 36969 * ((*seed0) & 65535) + ((*seed0) >> 16); // hash the seeds using bitwise AND and bitshifts
*seed1 = 18000 * ((*seed1) & 65535) + ((*seed1) >> 16);
unsigned int ires = ((*seed0) << 16) + (*seed1);
// Convert to float
union {
float f;
unsigned int ui;
} res;
res.ui = (ires & 0x007fffff) | 0x40000000; // bitwise AND, bitwise OR
return (res.f - 2.f) / 2.f;
}
// radiance function, the meat of path tracing
// solves the rendering equation:
// outgoing radiance (at a point) = emitted radiance + reflected radiance
// reflected radiance is sum (integral) of incoming radiance from all directions in hemisphere above point,
// multiplied by reflectance function of material (BRDF) and cosine incident angle
__device__ float3 radiance(Ray &r, unsigned int *s1, unsigned int *s2){ // returns ray color
float3 accucolor = make_float3(0.0f, 0.0f, 0.0f); // accumulates ray colour with each iteration through bounce loop
float3 mask = make_float3(1.0f, 1.0f, 1.0f);
// ray bounce loop (no Russian Roulette used)
for (int bounces = 0; bounces < 4; bounces++){ // iteration up to 4 bounces (replaces recursion in CPU code)
float t; // distance to closest intersection
int id = 0; // index of closest intersected sphere
// test ray for intersection with scene
if (!intersect_scene(r, t, id))
return make_float3(0.0f, 0.0f, 0.0f); // if miss, return black
// else, we've got a hit!
// compute hitpoint and normal
const Sphere &obj = spheres[id]; // hitobject
float3 x = r.orig + r.dir*t; // hitpoint
float3 n = normalize(x - obj.pos); // normal
float3 nl = dot(n, r.dir) < 0 ? n : n * -1; // front facing normal
// add emission of current sphere to accumulated colour
// (first term in rendering equation sum)
accucolor += mask * obj.emi;
// all spheres in the scene are diffuse
// diffuse material reflects light uniformly in all directions
// generate new diffuse ray:
// origin = hitpoint of previous ray in path
// random direction in hemisphere above hitpoint (see "Realistic Ray Tracing", P. Shirley)
// create 2 random numbers
float r1 = 2 * M_PI * getrandom(s1, s2); // pick random number on unit circle (radius = 1, circumference = 2*Pi) for azimuth
float r2 = getrandom(s1, s2); // pick random number for elevation
float r2s = sqrtf(r2);
// compute local orthonormal basis uvw at hitpoint to use for calculation random ray direction
// first vector = normal at hitpoint, second vector is orthogonal to first, third vector is orthogonal to first two vectors
float3 w = nl;
float3 u = normalize(cross((fabs(w.x) > .1 ? make_float3(0, 1, 0) : make_float3(1, 0, 0)), w));
float3 v = cross(w,u);
// compute random ray direction on hemisphere using polar coordinates
// cosine weighted importance sampling (favours ray directions closer to normal direction)
float3 d = normalize(u*cos(r1)*r2s + v*sin(r1)*r2s + w*sqrtf(1 - r2));
// new ray origin is intersection point of previous ray with scene
r.orig = x + nl*0.05f; // offset ray origin slightly to prevent self intersection
r.dir = d;
mask *= obj.col; // multiply with colour of object
mask *= dot(d,nl); // weigh light contribution using cosine of angle between incident light and normal
mask *= 2; // fudge factor
}
return accucolor;
}
// __global__ : executed on the device (GPU) and callable only from host (CPU)
// this kernel runs in parallel on all the CUDA threads
__global__ void render_kernel(float3 *output){
// assign a CUDA thread to every pixel (x,y)
// blockIdx, blockDim and threadIdx are CUDA specific keywords
// replaces nested outer loops in CPU code looping over image rows and image columns
unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;
unsigned int i = (height - y - 1)*width + x; // index of current pixel (calculated using thread index)
unsigned int s1 = x; // seeds for random number generator
unsigned int s2 = y;
// generate ray directed at lower left corner of the screen
// compute directions for all other rays by adding cx and cy increments in x and y direction
Ray cam(make_float3(50, 52, 295.6), normalize(make_float3(0, -0.042612, -1))); // first hardcoded camera ray(origin, direction)
float3 cx = make_float3(width * .5135 / height, 0.0f, 0.0f); // ray direction offset in x direction
float3 cy = normalize(cross(cx, cam.dir)) * .5135; // ray direction offset in y direction (.5135 is field of view angle)
float3 r; // r is final pixel color
r = make_float3(0.0f); // reset r to zero for every pixel
for (int s = 0; s < samps; s++){ // samples per pixel
// compute primary ray direction
float3 d = cam.dir + cx*((.25 + x) / width - .5) + cy*((.25 + y) / height - .5);
// create primary ray, add incoming radiance to pixelcolor
r = r + radiance(Ray(cam.orig + d * 40, normalize(d)), &s1, &s2)*(1. / samps);
} // Camera rays are pushed ^^^^^ forward to start in interior
// write rgb value of pixel to image buffer on the GPU, clamp value to [0.0f, 1.0f] range
output[i] = make_float3(clamp(r.x, 0.0f, 1.0f), clamp(r.y, 0.0f, 1.0f), clamp(r.z, 0.0f, 1.0f));
}
inline float clamp(float x){ return x < 0.0f ? 0.0f : x > 1.0f ? 1.0f : x; }
inline int toInt(float x){ return int(pow(clamp(x), 1 / 2.2) * 255 + .5); } // convert RGB float in range [0,1] to int in range [0, 255] and perform gamma correction
int main(){
float3* output_h = new float3[width*height]; // pointer to memory for image on the host (system RAM)
float3* output_d; // pointer to memory for image on the device (GPU VRAM)
// allocate memory on the CUDA device (GPU VRAM)
cudaMalloc(&output_d, width * height * sizeof(float3));
// dim3 is CUDA specific type, block and grid are required to schedule CUDA threads over streaming multiprocessors
dim3 block(8, 8, 1);
dim3 grid(width / block.x, height / block.y, 1);
printf("CUDA initialised.\nStart rendering...\n");
// schedule threads on device and launch CUDA kernel from host
render_kernel <<< grid, block >>>(output_d);
// copy results of computation from device back to host
cudaMemcpy(output_h, output_d, width * height *sizeof(float3), cudaMemcpyDeviceToHost);
// free CUDA memory
cudaFree(output_d);
printf("Done!\n");
// Write image to PPM file, a very simple image file format
FILE *f = fopen("smallptcuda.ppm", "w");
fprintf(f, "P3\n%d %d\n%d\n", width, height, 255);
for (int i = 0; i < width*height; i++) // loop over pixels, write RGB values
fprintf(f, "%d %d %d ", toInt(output_h[i].x),
toInt(output_h[i].y),
toInt(output_h[i].z));
printf("Saved image to 'smallptcuda.ppm'\n");
delete[] output_h;
system("PAUSE");
}