Vyas Giridharan / cudaNN / Commits / 22c744b8

Commit 22c744b8, authored 8 years ago by Vyas Giridharan

Added the BOMB

Showing 1 changed file: knn.cu (new file, mode 100644), 272 additions, 0 deletions
#include "cuda.h"
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>

// 2D texture holding the reference points (used when they fit in texture memory)
texture<float, 2, cudaReadModeElementType> texA;

#ifndef min
#define min(a,b) (((a) < (b)) ? (a) : (b))
#endif

// Hardware limits and tuning constants
#define MAX_PITCH_VALUE_IN_BYTES       262144
#define MAX_TEXTURE_WIDTH_IN_BYTES      65536
#define MAX_TEXTURE_HEIGHT_IN_BYTES     32768
#define MAX_PART_OF_FREE_MEMORY_USED      0.9
#define BLOCK_DIM                          16   // tile size used by cuComputeDistanceGlobal
// Kernel: takes the square root of the first k distances stored for each query point.
__global__ void cuParallelSqrt(float* dist, int width, int pitch, int k){
    unsigned int xIndex = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int yIndex = blockIdx.y * blockDim.y + threadIdx.y;
    if (xIndex < width && yIndex < k)
        dist[yIndex * pitch + xIndex] = sqrt(dist[yIndex * pitch + xIndex]);
}
// Kernel: computes the squared Euclidean distance between every point of A and every point of B
// (points are stored column-wise); AB[i * pB + j] receives the squared distance between
// column i of A and column j of B. Uses BLOCK_DIM x BLOCK_DIM shared-memory tiles.
__global__ void cuComputeDistanceGlobal(float* A, int wA, int pA, float* B, int wB, int pB, int dim, float* AB){

    // Declaration of the shared memory arrays used to store the sub-matrices of A and B
    __shared__ float shared_A[16][16];
    __shared__ float shared_B[16][16];

    // Sub-matrix of A (begin, step, end) and sub-matrix of B (begin, step)
    __shared__ int begin_A;
    __shared__ int begin_B;
    __shared__ int step_A;
    __shared__ int step_B;
    __shared__ int end_A;

    // Thread index
    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Other variables
    float tmp;
    float ssd = 0;

    // Looping parameters
    begin_A = 16 * blockIdx.y;
    begin_B = 16 * blockIdx.x;
    step_A  = 16 * pA;
    step_B  = 16 * pB;
    end_A   = begin_A + (dim - 1) * pA;

    // Conditions
    int cond0 = (begin_A + tx < wA);    // used to write in shared memory
    int cond1 = (begin_B + tx < wB);    // used to write in shared memory, in computations, and to write in the output matrix
    int cond2 = (begin_A + ty < wA);    // used in computations and to write in the output matrix

    // Loop over all the sub-matrices of A and B required to compute the block sub-matrix
    for (int a = begin_A, b = begin_B; a <= end_A; a += step_A, b += step_B) {

        // Load the matrices from device memory to shared memory; each thread loads one element of each matrix
        if (a / pA + ty < dim){
            shared_A[ty][tx] = (cond0) ? A[a + pA * ty + tx] : 0;
            shared_B[ty][tx] = (cond1) ? B[b + pB * ty + tx] : 0;
        }
        else {
            shared_A[ty][tx] = 0;
            shared_B[ty][tx] = 0;
        }

        // Synchronize to make sure the matrices are loaded
        __syncthreads();

        // Compute the difference between the two matrices; each thread computes one element of the block sub-matrix
        if (cond2 && cond1){
            for (int k = 0; k < BLOCK_DIM; ++k){
                tmp = shared_A[k][ty] - shared_B[k][tx];
                ssd += tmp * tmp;
            }
        }

        // Synchronize to make sure that the preceding computation is done before loading two new sub-matrices of A and B in the next iteration
        __syncthreads();
    }

    // Write the block sub-matrix to device memory; each thread writes one element
    if (cond2 && cond1)
        AB[(begin_A + ty) * pB + begin_B + tx] = ssd;
}
// Reports a failed device memory allocation together with the requested size in bytes.
void printErrorMessage(cudaError_t error, size_t memorySize){
    printf("\nMemory allocation error : %s (requested %zu bytes)\n", cudaGetErrorString(error), memorySize);
}

// Host-side driver: uploads the reference and query points, processes the queries in chunks
// that fit in device memory, and prepares the grid/block configurations for the kernels.
void knn(float* ref_host, int ref_width, float* query_host, int query_width, int height, int k, float* dist_host, int* ind_host){

    unsigned int size_of_float = sizeof(float);
    unsigned int size_of_int   = sizeof(int);

    // Device variables
    float       *query_dev;
    float       *ref_dev;
    float       *dist_dev;
    int         *ind_dev;
    cudaArray   *ref_array;
    cudaError_t  result;

    // Sizes
    size_t query_pitch;
    size_t query_pitch_in_bytes;
    size_t ref_pitch;
    size_t ref_pitch_in_bytes;
    size_t ind_pitch;
    size_t ind_pitch_in_bytes;
    size_t max_nb_query_traited;
    size_t actual_nb_query_width;
    size_t memory_total;
    size_t memory_free;

    // Use texture memory for the reference points if they fit
    unsigned int use_texture = (ref_width * size_of_float <= MAX_TEXTURE_WIDTH_IN_BYTES && height * size_of_float <= MAX_TEXTURE_HEIGHT_IN_BYTES);

    // CUDA initialisation
    cuInit(0);

    // Check free memory using the driver API; only (MAX_PART_OF_FREE_MEMORY_USED*100)% of it will be used
    CUcontext cuContext;
    CUdevice  cuDevice = 0;
    cuCtxCreate(&cuContext, 0, cuDevice);
    cuMemGetInfo(&memory_free, &memory_total);
    printf("Memory :: Free : %zu , Total : %zu", memory_free, memory_total);
    cuCtxDetach(cuContext);

    // Maximum number of query points that can be treated at once
    max_nb_query_traited = (memory_free * MAX_PART_OF_FREE_MEMORY_USED - size_of_float * ref_width * height) / (size_of_float * (height + ref_width) + size_of_int * k);
    max_nb_query_traited = min(query_width, (max_nb_query_traited / 16) * 16);

    // Allocation of global memory for query points and for distances
    result = cudaMallocPitch((void**)&query_dev, &query_pitch_in_bytes, max_nb_query_traited * size_of_float, height + ref_width);
    if (result){
        printf("\n%zu\n", max_nb_query_traited * size_of_float * (height + ref_width));
        return;
    }
    printf("\nQuery pitch : %zu\n", query_pitch_in_bytes / sizeof(float));
    query_pitch = query_pitch_in_bytes / size_of_float;
    dist_dev    = query_dev + height * query_pitch;

    // Allocation of global memory for the indexes
    result = cudaMallocPitch((void**)&ind_dev, &ind_pitch_in_bytes, max_nb_query_traited * size_of_int, k);
    if (result){
        cudaFree(query_dev);
        printErrorMessage(result, max_nb_query_traited * size_of_int * k);
        return;
    }
    ind_pitch = ind_pitch_in_bytes / size_of_int;

    if (use_texture){
        // Allocation of texture memory
        cudaChannelFormatDesc channelDescA = cudaCreateChannelDesc<float>();
        result = cudaMallocArray(&ref_array, &channelDescA, ref_width, height);
        if (result){
            printErrorMessage(result, ref_width * height * size_of_float);
            cudaFree(ind_dev);
            cudaFree(query_dev);
            return;
        }
        cudaMemcpyToArray(ref_array, 0, 0, ref_host, ref_width * height * size_of_float, cudaMemcpyHostToDevice);

        // Set texture parameters and bind texture to array
        texA.addressMode[0] = cudaAddressModeClamp;
        texA.addressMode[1] = cudaAddressModeClamp;
        texA.filterMode     = cudaFilterModePoint;
        texA.normalized     = 0;
        cudaBindTextureToArray(texA, ref_array);
    }
    else {
        // Allocation of global memory for the reference points
        result = cudaMallocPitch((void**)&ref_dev, &ref_pitch_in_bytes, ref_width * size_of_float, height);
        if (result){
            printErrorMessage(result, ref_width * size_of_float * height);
            cudaFree(ind_dev);
            cudaFree(query_dev);
            return;
        }
        ref_pitch = ref_pitch_in_bytes / size_of_float;
        cudaMemcpy2D(ref_dev, ref_pitch_in_bytes, ref_host, ref_width * size_of_float, ref_width * size_of_float, height, cudaMemcpyHostToDevice);
    }

    // Split queries to fit in GPU memory
    for (int i = 0; i < query_width; i += max_nb_query_traited){

        // Number of query points considered in this chunk
        actual_nb_query_width = min(max_nb_query_traited, query_width - i);

        // Copy the part of the query set actually being treated
        cudaMemcpy2D(query_dev, query_pitch_in_bytes, &query_host[i], query_width * size_of_float, actual_nb_query_width * size_of_float, height, cudaMemcpyHostToDevice);

        // Grid and block sizes for computing the distances
        dim3 g_16x16(actual_nb_query_width / 16, ref_width / 16, 1);
        dim3 t_16x16(16, 16, 1);
        if (actual_nb_query_width % 16 != 0) g_16x16.x += 1;
        if (ref_width % 16 != 0)             g_16x16.y += 1;

        // Grid and block sizes for sorting the distances
        dim3 g_256x1(actual_nb_query_width / 256, 1, 1);
        dim3 t_256x1(256, 1, 1);
        if (actual_nb_query_width % 256 != 0) g_256x1.x += 1;

        // Grid and block sizes for computing the square root of the k first elements
        dim3 g_k_16x16(actual_nb_query_width / 16, k / 16, 1);
        dim3 t_k_16x16(16, 16, 1);
        if (actual_nb_query_width % 16 != 0) g_k_16x16.x += 1;
        if (k % 16 != 0)                     g_k_16x16.y += 1;
    }
}
int main(void){

    // Variables and parameters
    float *ref;                 // Pointer to reference point array
    float *query;               // Pointer to query point array
    float *dist;                // Pointer to distance array
    int   *ind;                 // Pointer to index array
    int    ref_nb     = 65535;  // Reference point number, max=65535
    int    query_nb   = 65535;  // Query point number, max=65535
    int    dim        = 32;     // Dimension of points
    int    k          = 20;     // Nearest neighbors to consider
    int    iterations = 100;
    int    i;

    // Host allocations
    ref   = (float *) malloc(ref_nb   * dim * sizeof(float));
    query = (float *) malloc(query_nb * dim * sizeof(float));
    dist  = (float *) malloc(query_nb * k   * sizeof(float));
    ind   = (int *)   malloc(query_nb * k   * sizeof(int));

    // Fill random values
    srand(time(NULL));
    for (i = 0; i < ref_nb * dim; i++)
        ref[i] = (float)rand() / (float)RAND_MAX;
    for (i = 0; i < query_nb * dim; i++)
        query[i] = (float)rand() / (float)RAND_MAX;

    // Time the kNN search over the requested number of iterations
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    float elapsed_time;

    cudaEventRecord(start, 0);
    for (i = 0; i < iterations; i++)
        knn(ref, ref_nb, query, query_nb, dim, k, dist, ind);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsed_time, start, stop);
    //printf(" done in %f s for %d iterations (%f s by iteration)\n", elapsed_time/1000, iterations, elapsed_time/(iterations*1000));

    // Destroy cuda event objects and free memory
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    free(ind);
    free(dist);
    free(query);
    free(ref);

    return 0;
}
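As committed, knn() computes grid and block dimensions inside the query loop but never launches the kernels, never copies results back into dist_host/ind_host, and never frees the device memory; the g_256x1/t_256x1 configuration also points at a k-selection (sorting) kernel that is not part of this file. The sketch below is an assumption about how the intended completion of the per-chunk loop body might look, not code from this commit: it reuses the variables already declared in knn(), takes the global-memory path (use_texture == 0), and leaves out the selection of the k smallest distances because no such kernel exists here.

    // Hypothetical completion of the per-chunk body of knn() (not part of this commit).
    // Assumes the global-memory path, i.e. ref_dev/ref_pitch are valid and use_texture == 0.

    // 1. Squared distances: row r, column q of dist_dev = squared distance between
    //    reference point r and query point q of the current chunk.
    cuComputeDistanceGlobal<<<g_16x16, t_16x16>>>(ref_dev, ref_width, ref_pitch,
                                                  query_dev, actual_nb_query_width, query_pitch,
                                                  height, dist_dev);

    // 2. A k-selection step sized for g_256x1/t_256x1 would go here; it would keep the k
    //    smallest distances per query in the first k rows of dist_dev and record their
    //    reference indexes in ind_dev. No such kernel is included in this commit.

    // 3. Square root of the k retained distances (they were kept squared until now).
    cuParallelSqrt<<<g_k_16x16, t_k_16x16>>>(dist_dev, actual_nb_query_width, query_pitch, k);

    // 4. Copy the k distances and indexes of this chunk back to the host arrays.
    cudaMemcpy2D(&dist_host[i], query_width * size_of_float, dist_dev, query_pitch_in_bytes,
                 actual_nb_query_width * size_of_float, k, cudaMemcpyDeviceToHost);
    cudaMemcpy2D(&ind_host[i],  query_width * size_of_int,  ind_dev,  ind_pitch_in_bytes,
                 actual_nb_query_width * size_of_int,  k, cudaMemcpyDeviceToHost);

    // After the query loop, the device memory would be released:
    if (use_texture) { cudaUnbindTexture(texA); cudaFreeArray(ref_array); }
    else             { cudaFree(ref_dev); }
    cudaFree(ind_dev);
    cudaFree(query_dev);

With BLOCK_DIM and printErrorMessage in place, the file should build with something like nvcc knn.cu -o knn -lcuda; the -lcuda flag is needed because cuInit, cuCtxCreate and cuMemGetInfo come from the driver API rather than the runtime API.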