#include <cstdlib>
#include <cuda_runtime.h>

#include "AMP/utils/cuda/GPUFunctionTable.hpp"
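
/********************************************************
 * Generic transform kernels: apply a functor to each    *
 * element (unary and binary forms)                       *
 ********************************************************/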
// The functor is taken by value so the device receives its own copy
template<class TYPE, typename LAMBDA>
__global__ void transform( LAMBDA fun, TYPE *d_x, TYPE *d_y, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_y[i] = fun( d_x[i] );
    }
}

template<class TYPE, typename LAMBDA>
__global__ void transform( LAMBDA fun, TYPE *d_x, TYPE *d_y, TYPE *d_z, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_z[i] = fun( d_x[i], d_y[i] );
    }
}
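
/********************************************************
 * Element-wise activation kernels                        *
 ********************************************************/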
template<class TYPE>
__global__ void transformReLUKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = fmax( d_a[i], static_cast<TYPE>( 0 ) );
    }
}

template<class TYPE>
__global__ void transformAbsKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = fabs( d_a[i] );
    }
}

template<class TYPE>
__global__ void transformTanhKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = tanh( d_a[i] );
    }
}

template<class TYPE>
__global__ void transformHardTanhKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = fmax( -1.0, fmin( 1.0, d_a[i] ) );
    }
}

template<class TYPE>
__global__ void transformSigmoidKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = 1.0 / ( 1.0 + exp( -d_a[i] ) );
    }
}

template<class TYPE>
__global__ void transformSoftPlusKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x ) {
        d_b[i] = log1p( exp( d_a[i] ) );
    }
}
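
/********************************************************
 * Sum reduction: each block writes one partial sum       *
 ********************************************************/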
template<class TYPE, unsigned int blockSize>
__global__ void sumKernel( const TYPE *g_idata, TYPE *g_odata, size_t n )
{
    // Dynamic shared memory, reinterpreted to the element type
    // (an int-typed buffer would index with the wrong element size)
    extern __shared__ unsigned char sum_smem[];
    TYPE *sdata = reinterpret_cast<TYPE *>( sum_smem );

    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    // Grid-stride pass: each thread accumulates two elements per iteration
    sdata[tid] = 0;
    while ( i < n ) {
        sdata[tid] += g_idata[i];
        if ( ( i + blockSize ) < n ) {
            sdata[tid] += g_idata[i + blockSize];
        }
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction in shared memory (blockSize is a compile-time constant,
    // so the untaken branches are eliminated)
    if ( blockSize >= 512 && tid < 256 )
        sdata[tid] += sdata[tid + 256];
    __syncthreads();
    if ( blockSize >= 256 && tid < 128 )
        sdata[tid] += sdata[tid + 128];
    __syncthreads();
    if ( blockSize >= 128 && tid < 64 )
        sdata[tid] += sdata[tid + 64];
    __syncthreads();
    if ( blockSize >= 64 && tid < 32 )
        sdata[tid] += sdata[tid + 32];
    __syncthreads();
    if ( blockSize >= 32 && tid < 16 )
        sdata[tid] += sdata[tid + 16];
    __syncthreads();
    if ( blockSize >= 16 && tid < 8 )
        sdata[tid] += sdata[tid + 8];
    __syncthreads();
    if ( blockSize >= 8 && tid < 4 )
        sdata[tid] += sdata[tid + 4];
    __syncthreads();
    if ( blockSize >= 4 && tid < 2 )
        sdata[tid] += sdata[tid + 2];
    __syncthreads();
    if ( blockSize >= 2 && tid < 1 )
        sdata[tid] += sdata[tid + 1];
    __syncthreads();

    // Thread 0 writes this block's partial sum
    if ( tid == 0 )
        g_odata[blockIdx.x] = sdata[0];
}
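
/********************************************************
 * Tolerance-based equality: each block ANDs |a-b| < tol  *
 ********************************************************/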
template<class TYPE, unsigned int blockSize>
__global__ void equalsKernel( const TYPE *d_a, const TYPE *d_b, bool *g_odata, size_t n, TYPE tol )
{
    extern __shared__ bool esdata[];

    unsigned int tid      = threadIdx.x;
    unsigned int i        = blockIdx.x * blockSize * 2 + tid;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    // Grid-stride pass: AND together |a-b| < tol for this thread's elements
    esdata[tid] = true;
    while ( i < n ) {
        esdata[tid] = esdata[tid] && ( fabs( d_a[i] - d_b[i] ) < tol );
        if ( ( i + blockSize ) < n ) {
            esdata[tid] = esdata[tid] && ( fabs( d_a[i + blockSize] - d_b[i + blockSize] ) < tol );
        }
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction (logical AND) in shared memory
    if ( blockSize >= 512 && tid < 256 )
        esdata[tid] = esdata[tid] && esdata[tid + 256];
    __syncthreads();
    if ( blockSize >= 256 && tid < 128 )
        esdata[tid] = esdata[tid] && esdata[tid + 128];
    __syncthreads();
    if ( blockSize >= 128 && tid < 64 )
        esdata[tid] = esdata[tid] && esdata[tid + 64];
    __syncthreads();
    if ( blockSize >= 64 && tid < 32 )
        esdata[tid] = esdata[tid] && esdata[tid + 32];
    __syncthreads();
    if ( blockSize >= 32 && tid < 16 )
        esdata[tid] = esdata[tid] && esdata[tid + 16];
    __syncthreads();
    if ( blockSize >= 16 && tid < 8 )
        esdata[tid] = esdata[tid] && esdata[tid + 8];
    __syncthreads();
    if ( blockSize >= 8 && tid < 4 )
        esdata[tid] = esdata[tid] && esdata[tid + 4];
    __syncthreads();
    if ( blockSize >= 4 && tid < 2 )
        esdata[tid] = esdata[tid] && esdata[tid + 2];
    __syncthreads();
    if ( blockSize >= 2 && tid < 1 )
        esdata[tid] = esdata[tid] && esdata[tid + 1];
    __syncthreads();

    // Thread 0 writes this block's result
    if ( tid == 0 )
        g_odata[blockIdx.x] = esdata[0];
}
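
/********************************************************
 * Host wrappers for the transform kernels                *
 ********************************************************/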
template<class TYPE>
void transformReLUW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    // setKernelDims (assumed to be declared in the included header) picks launch dimensions for n elements
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformReLUKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}

template<class TYPE>
void transformAbsW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformAbsKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}

template<class TYPE>
void transformTanhW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformTanhKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}

template<class TYPE>
void transformHardTanhW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformHardTanhKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}

template<class TYPE>
void transformSigmoidW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformSigmoidKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}

template<class TYPE>
void transformSoftPlusW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 BlockDim, GridDim;
    setKernelDims( n, BlockDim, GridDim );
    transformSoftPlusKernel<TYPE><<<GridDim, BlockDim>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
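
/********************************************************
 * Host wrappers for the reductions                       *
 ********************************************************/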
template<class TYPE>
TYPE sumW( const TYPE *d_a, size_t n )
{
    unsigned int threads = 128;
    // One block per 2*threads elements (each thread reads two elements per pass)
    unsigned int blocks = ( n + threads * 2 - 1 ) / ( threads * 2 );
    dim3 gridSize( blocks, 1, 1 );
    dim3 blockSize( threads, 1, 1 );
    int smemsize = ( threads <= 32 ) ? 2 * threads * sizeof( TYPE ) : threads * sizeof( TYPE );
    TYPE *d_odata, *h_odata;
    size_t bytes = sizeof( TYPE ) * blocks;
    h_odata = (TYPE *) malloc( bytes );
    cudaMalloc( (void **) &d_odata, bytes );
    sumKernel<TYPE, 128><<<gridSize, blockSize, smemsize>>>( d_a, d_odata, n );
    cudaDeviceSynchronize();
    cudaMemcpy( h_odata, d_odata, bytes, cudaMemcpyDeviceToHost );
    // Accumulate the per-block partial sums on the host
    TYPE sum = 0;
    for ( unsigned int i = 0; i < blocks; i++ ) {
        sum += h_odata[i];
    }
    cudaFree( d_odata );
    free( h_odata );
    return sum;
}

template<class TYPE>
bool equalsW( const TYPE *d_a, const TYPE *d_b, TYPE tol, size_t n )
{
    unsigned int threads = 128;
    unsigned int blocks = ( n + threads * 2 - 1 ) / ( threads * 2 );
    dim3 gridSize( blocks, 1, 1 );
    dim3 blockSize( threads, 1, 1 );
    int smemsize = ( threads <= 32 ) ? 2 * threads * sizeof( bool ) : threads * sizeof( bool );
    bool *d_odata, *h_odata;
    size_t bytes = sizeof( bool ) * blocks;
    h_odata = (bool *) malloc( bytes );
    cudaMalloc( (void **) &d_odata, bytes );
    equalsKernel<TYPE, 128><<<gridSize, blockSize, smemsize>>>( d_a, d_b, d_odata, n, tol );
    cudaDeviceSynchronize();
    cudaMemcpy( h_odata, d_odata, bytes, cudaMemcpyDeviceToHost );
    // AND together the per-block results on the host
    bool eq = true;
    for ( unsigned int i = 0; i < blocks; i++ ) {
        eq = eq && h_odata[i];
    }
    cudaFree( d_odata );
    free( h_odata );
    return eq;
}

/********************************************************
 * Explicit instantiations of GPUFunctionTable            *
 ********************************************************/
#define INSTANTIATE( T )                                                      \
    template class GPUFunctionTable<T>;                                       \
    template T sumW<T>( const T *d_a, size_t n );                             \
    template bool equalsW<T>( const T *d_a, const T *d_b, T tol, size_t n );  \
    template void transformReLUW<T>( const T *d_a, T *d_b, size_t n );        \
    template void transformAbsW<T>( const T *d_a, T *d_b, size_t n );         \
    template void transformTanhW<T>( const T *d_a, T *d_b, size_t n );        \
    template void transformHardTanhW<T>( const T *d_a, T *d_b, size_t n );    \
    template void transformSigmoidW<T>( const T *d_a, T *d_b, size_t n );     \
    template void transformSoftPlusW<T>( const T *d_a, T *d_b, size_t n )
INSTANTIATE( double );