-
Notifications
You must be signed in to change notification settings - Fork 144
/
costs.cu
113 lines (96 loc) · 4.08 KB
/
costs.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
/**
This file is part of sgm. (https://github.com/dhernandez0/sgm).
Copyright (c) 2016 Daniel Hernandez Juarez.
sgm is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
sgm is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with sgm. If not, see <http://www.gnu.org/licenses/>.
**/
#include "costs.h"
#include <stdio.h>
/**
 * Center-symmetric census transform of two images, staged through shared memory.
 *
 * For every pixel (idy, idx) inside the image, one descriptor is written to
 * transform (computed from im) and one to transform2 (computed from im2).
 * Bit (k*CENSUS_WIDTH + m) of a descriptor is set when the pixel at window
 * offset (k, m) is >= the pixel at the mirrored offset (2*TOP-k, 2*LEFT-m).
 * Only the first half of the window is visited (full rows above the middle
 * row, plus the left half of the middle row when CENSUS_HEIGHT is odd), so
 * every symmetric pair is compared exactly once.
 *
 * Launch layout:
 *   - 2D grid / 2D block, one thread per output pixel.
 *   - blockDim.x <= 32 and blockDim.y <= 32: the shared tile is sized for a
 *     32x32 block plus a halo of LEFT columns / TOP rows on each side, and the
 *     census reads up to column threadIdx.x+2*LEFT and row threadIdx.y+2*TOP.
 *   - __launch_bounds__(1024, 2): at most 1024 threads per block, and the
 *     compiler is asked to keep register use low enough for 2 blocks per SM.
 *
 * Shared memory: two static uint8_t tiles of (32+2*LEFT)*(32+2*TOP) bytes.
 *
 * @param im          first input image, row-major, cols bytes per row
 * @param im2         second input image, same layout as im
 * @param transform   output descriptors for im, row-major, cols entries per row
 * @param transform2  output descriptors for im2, same layout as transform
 * @param rows        image height in pixels
 * @param cols        image width in pixels
 *
 * Pixels outside the image are treated as 0 when the tile is loaded.
 * NOTE(review): assumes CENSUS_WIDTH == 2*LEFT+1, CENSUS_HEIGHT == 2*TOP+1 and
 * that the highest bit index fits in cost_t -- confirm against costs.h.
 **/
__global__ void
__launch_bounds__(1024, 2)
CenterSymmetricCensusKernelSM2(const uint8_t *im, const uint8_t *im2, cost_t *transform, cost_t *transform2, const uint32_t rows, const uint32_t cols) {
    const int idx = blockIdx.x*blockDim.x+threadIdx.x;
    const int idy = blockIdx.y*blockDim.y+threadIdx.y;

    // Tile = 32x32 block footprint plus the census halo on every side.
    const int win_cols = (32+LEFT*2);
    const int win_rows = (32+TOP*2);

    __shared__ uint8_t window[win_cols*win_rows];
    __shared__ uint8_t window2[win_cols*win_rows];

    // Image coordinates of the tile's top-left element (negative on the
    // first block row/column because of the halo).
    const int tile_row0 = static_cast<int>(blockIdx.y*blockDim.y)-TOP;
    const int tile_col0 = static_cast<int>(blockIdx.x*blockDim.x)-LEFT;

    // Cooperative load: the tile has more elements than the block has threads,
    // so each thread copies every block_size-th element. For a 32x32 block this
    // is the same two-pass fill as before, but it also fills the whole tile
    // when the kernel is launched with a smaller block.
    const int block_size = blockDim.x*blockDim.y;
    const int tid = threadIdx.y*blockDim.x+threadIdx.x;
    for(int i = tid; i < win_cols*win_rows; i += block_size) {
        const int sm_row = i / win_cols;
        const int sm_col = i % win_cols;
        const int im_row = tile_row0+sm_row;
        const int im_col = tile_col0+sm_col;
        const bool inside = (im_row >= 0 && im_col >= 0
                             && static_cast<uint32_t>(im_row) < rows
                             && static_cast<uint32_t>(im_col) < cols);
        // size_t offset so that rows*cols beyond 2^32 cannot wrap around.
        const size_t src = inside ? static_cast<size_t>(im_row)*cols+im_col : 0;
        window[i] = inside ? im[src] : 0;
        window2[i] = inside ? im2[src] : 0;
    }

    // Every thread reaches this barrier; the tile is complete afterwards.
    __syncthreads();

    if(idy < rows && idx < cols) {
        cost_t census = 0;
        cost_t census2 = 0;

        // Rows [0, CENSUS_HEIGHT/2) are scanned over their full width. When the
        // height is odd, the middle row contributes only its left half, because
        // its right half is the mirror image of the left one.
        const int half_rows = CENSUS_HEIGHT/2;
        const int last_row = (CENSUS_HEIGHT % 2 != 0) ? half_rows : half_rows-1;
        for(int k = 0; k <= last_row; k++) {
            const int m_end = (k < half_rows) ? CENSUS_WIDTH : CENSUS_WIDTH/2;
            for(int m = 0; m < m_end; m++) {
                // Pixel at offset (k, m) and its partner mirrored through the
                // window center.
                const int pos = (threadIdx.y+k)*win_cols+threadIdx.x+m;
                const int mirror = (threadIdx.y+2*TOP-k)*win_cols+threadIdx.x+2*LEFT-m;
                const int shft = k*CENSUS_WIDTH+m;

                // Compare the symmetric pair, move the result to its bit
                // position and merge it into the descriptor.
                cost_t tmp = (window[pos] >= window[mirror]);
                tmp <<= shft;
                census |= tmp;

                cost_t tmp2 = (window2[pos] >= window2[mirror]);
                tmp2 <<= shft;
                census2 |= tmp2;
            }
        }

        const size_t dst = static_cast<size_t>(idy)*cols+idx;
        transform[dst] = census;
        transform2[dst] = census2;
    }
}