Advanced Multi-Physics (AMP)
On-Line Documentation
GPUFunctionTable.cu
Go to the documentation of this file.
1#include <cuda.h>
2#include <cuda_runtime.h>
3
4#include "AMP/utils/cuda/GPUFunctionTable.hpp"
5
6
7namespace AMP {
8
9// Kernels
// Elementwise unary transform: d_y[i] = fun( d_x[i] ) for i in [0, n).
//
// The functor is taken BY VALUE: __global__ kernel arguments are copied to the
// device at launch, so a reference parameter (the original `LAMBDA &fun`) would
// carry a host address that the device then dereferences — undefined behavior.
// LAMBDA must therefore be trivially copyable and callable in device code.
//
// Grid-stride loop: correct for any grid/block configuration; size_t indexing
// avoids signed-int overflow when n exceeds INT_MAX.
template<class TYPE, typename LAMBDA>
__global__ void transform( LAMBDA fun, TYPE *d_x, TYPE *d_y, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_y[i] = fun( d_x[i] );
    }
}
17
// Elementwise binary transform: d_z[i] = fun( d_x[i], d_y[i] ) for i in [0, n).
//
// The functor is taken BY VALUE: __global__ kernel arguments are copied to the
// device at launch, so a reference parameter (the original `LAMBDA &fun`) would
// carry a host address that the device then dereferences — undefined behavior.
//
// Grid-stride loop with size_t indexing (the original `int i` could overflow
// for n > INT_MAX).
template<class TYPE, typename LAMBDA>
__global__ void transform( LAMBDA fun, TYPE *d_x, TYPE *d_y, TYPE *d_z, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_z[i] = fun( d_x[i], d_y[i] );
    }
}
25
26
// ReLU: d_b[i] = max( d_a[i], 0 ) for i in [0, n).
// Grid-stride loop; size_t indexing fixes the original int index, which could
// overflow for n > INT_MAX.
// NOTE(review): fmax promotes float arguments to double; a float-specialized
// fmaxf would be faster — left unchanged to preserve results exactly.
template<class TYPE>
__global__ void transformReLUKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = fmax( d_a[i], static_cast<TYPE>( 0 ) );
    }
}
34
// Absolute value: d_b[i] = |d_a[i]| for i in [0, n).
// Grid-stride loop; size_t indexing fixes the original int index, which could
// overflow for n > INT_MAX.
// NOTE(review): fabs promotes float arguments to double; fabsf would avoid the
// round trip — left unchanged to preserve results exactly.
template<class TYPE>
__global__ void transformAbsKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = fabs( d_a[i] );
    }
}
42
// Hyperbolic tangent: d_b[i] = tanh( d_a[i] ) for i in [0, n).
// Grid-stride loop; size_t indexing fixes the original int index, which could
// overflow for n > INT_MAX.
// NOTE(review): tanh evaluates in double even for TYPE=float; tanhf would be
// faster — left unchanged to preserve results exactly.
template<class TYPE>
__global__ void transformTanhKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = tanh( d_a[i] );
    }
}
50
// Hard tanh: clamp each element to [-1, 1], i.e. d_b[i] = max(-1, min(1, d_a[i])).
// Grid-stride loop; size_t indexing fixes the original int index, which could
// overflow for n > INT_MAX.
// The 1.0 literals force double intermediates; the clamp bounds are exactly
// representable, so results are unchanged for float — kept as-is.
template<class TYPE>
__global__ void transformHardTanhKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = fmax( -1.0, fmin( 1.0, d_a[i] ) );
    }
}
58
// Logistic sigmoid: d_b[i] = 1 / (1 + exp(-d_a[i])) for i in [0, n).
// Grid-stride loop; size_t indexing fixes the original int index, which could
// overflow for n > INT_MAX.
// NOTE(review): the arithmetic runs in double even for TYPE=float (1.0 / exp
// double overload); kept so results match the original bit-for-bit.
template<class TYPE>
__global__ void transformSigmoidKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = 1.0 / ( 1.0 + exp( -d_a[i] ) );
    }
}
66
// Softplus: d_b[i] = log(1 + exp(d_a[i])), computed via log1p for accuracy
// near zero. Grid-stride loop; size_t indexing fixes the original int index,
// which could overflow for n > INT_MAX.
// NOTE(review): exp(d_a[i]) overflows to +inf for large inputs (result +inf);
// a numerically-stable form is max(x,0) + log1p(exp(-|x|)) — flagged, not
// changed, to preserve existing results.
template<class TYPE>
__global__ void transformSoftPlusKernel( const TYPE *d_a, TYPE *d_b, size_t n )
{
    for ( size_t i = blockIdx.x * (size_t) blockDim.x + threadIdx.x; i < n;
          i += (size_t) blockDim.x * gridDim.x ) {
        d_b[i] = log1p( exp( d_a[i] ) );
    }
}
74
// Block-level sum reduction: each block writes the sum of its assigned slice of
// g_idata into g_odata[blockIdx.x]; the host reduces the per-block partials.
//
// Launch contract: blockDim.x == blockSize (a power of two <= 512), and the
// dynamic shared-memory size must be at least blockSize * sizeof(TYPE).
//
// BUG FIX: the original declared the scratch buffer `extern __shared__ int
// sdata[]`, so float/double partial sums were truncated to int. Dynamic shared
// memory is now declared as raw bytes and cast to TYPE (the standard idiom for
// templated kernels, since `extern __shared__ TYPE sdata[]` would collide
// across template instantiations).
template<class TYPE, unsigned int blockSize>
__global__ void sumKernel( const TYPE *g_idata, TYPE *g_odata, size_t n )
{
    extern __shared__ unsigned char sumSmemRaw[];
    TYPE *sdata = reinterpret_cast<TYPE *>( sumSmemRaw );

    unsigned int tid = threadIdx.x;
    // Each thread strides through the input summing two elements per grid pass;
    // size_t arithmetic avoids 32-bit overflow for very large n.
    size_t i        = (size_t) blockIdx.x * blockSize * 2 + tid;
    size_t gridSize = (size_t) blockSize * 2 * gridDim.x;
    sdata[tid]      = 0;

    while ( i < n ) {
        sdata[tid] += g_idata[i];
        if ( ( i + blockSize ) < n ) {
            sdata[tid] += g_idata[i + blockSize];
        }
        i += gridSize;
    }
    __syncthreads();

    // Tree reduction in shared memory. blockSize is a compile-time constant so
    // the loop fully unrolls; the barrier sits outside the divergent branch, so
    // every thread reaches it (required for __syncthreads correctness).
    for ( unsigned int s = blockSize / 2; s > 0; s >>= 1 ) {
        if ( tid < s ) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    if ( tid == 0 )
        g_odata[blockIdx.x] = sdata[0];
}
154
// Block-level approximate-equality reduction: g_odata[blockIdx.x] is true iff
// every pair (d_a[i], d_b[i]) handled by this block satisfies |a - b| < tol.
// The host ANDs the per-block flags to get the final verdict.
//
// Launch contract: blockDim.x == blockSize (a power of two <= 512), and the
// dynamic shared-memory size must be at least blockSize * sizeof(bool).
template<class TYPE, unsigned int blockSize>
__global__ void equalsKernel( const TYPE *d_a, const TYPE *d_b, bool *g_odata, size_t n, TYPE tol )
{
    extern __shared__ bool esdata[];

    const unsigned int tid = threadIdx.x;
    unsigned int idx       = blockIdx.x * blockSize * 2 + tid;
    const unsigned int stride = blockSize * 2 * gridDim.x;

    // Accumulate this thread's verdict locally, two elements per grid pass.
    bool ok = true;
    while ( idx < n ) {
        ok = ok && ( fabs( d_a[idx] - d_b[idx] ) < tol );
        if ( ( idx + blockSize ) < n ) {
            ok = ok && ( fabs( d_a[idx + blockSize] - d_b[idx + blockSize] ) < tol );
        }
        idx += stride;
    }
    esdata[tid] = ok;
    __syncthreads();

    // AND-reduce the per-thread flags in shared memory. blockSize is a
    // compile-time constant, so this halving loop unrolls; the barrier is
    // outside the divergent branch so all threads reach it.
    for ( unsigned int s = blockSize / 2; s > 0; s >>= 1 ) {
        if ( tid < s ) {
            esdata[tid] = esdata[tid] && esdata[tid + s];
        }
        __syncthreads();
    }

    if ( tid == 0 )
        g_odata[blockIdx.x] = esdata[0];
}
234
235// Wrappers
// Host wrapper: apply ReLU elementwise on the device (d_b[i] = max(d_a[i], 0))
// and block until the kernel completes.
template<class TYPE>
void transformReLUW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformReLUKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
245
// Host wrapper: compute elementwise absolute value on the device
// (d_b[i] = |d_a[i]|) and block until the kernel completes.
template<class TYPE>
void transformAbsW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformAbsKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
255
// Host wrapper: compute elementwise tanh on the device
// (d_b[i] = tanh(d_a[i])) and block until the kernel completes.
template<class TYPE>
void transformTanhW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformTanhKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
265
// Host wrapper: clamp each element to [-1, 1] on the device
// (d_b[i] = max(-1, min(1, d_a[i]))) and block until the kernel completes.
template<class TYPE>
void transformHardTanhW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformHardTanhKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
275
// Host wrapper: compute the logistic sigmoid elementwise on the device
// (d_b[i] = 1 / (1 + exp(-d_a[i]))) and block until the kernel completes.
template<class TYPE>
void transformSigmoidW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformSigmoidKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
285
// Host wrapper: compute softplus elementwise on the device
// (d_b[i] = log1p(exp(d_a[i]))) and block until the kernel completes.
template<class TYPE>
void transformSoftPlusW( const TYPE *d_a, TYPE *d_b, size_t n )
{
    dim3 block, grid;
    setKernelDims( n, block, grid ); // project helper: picks launch shape for n
    transformSoftPlusKernel<TYPE><<<grid, block>>>( d_a, d_b, n );
    cudaDeviceSynchronize();
}
295
// Host wrapper: sum the n elements of device array d_a.
// Launches a fixed 64-block x 128-thread reduction; each block writes one
// partial sum, and the host adds the 64 partials to produce the result.
template<class TYPE>
TYPE sumW( const TYPE *d_a, size_t n )
{
    constexpr unsigned int nThreads = 128;
    constexpr int nBlocks           = 64;
    dim3 grid( nBlocks, 1, 1 );
    dim3 block( nThreads, 1, 1 );
    // Dynamic shared memory: one TYPE slot per thread (doubled for tiny blocks,
    // a convention kept from the classic reduction kernels).
    int shMemBytes = ( nThreads <= 32 ) ? 2 * nThreads * sizeof( TYPE ) : nThreads * sizeof( TYPE );

    const size_t partialBytes = sizeof( TYPE ) * nBlocks;
    TYPE *h_partials          = (TYPE *) malloc( partialBytes );
    TYPE *d_partials          = nullptr;
    cudaMalloc( (void **) &d_partials, partialBytes );

    sumKernel<TYPE, nThreads><<<grid, block, shMemBytes>>>( d_a, d_partials, n );
    cudaDeviceSynchronize();
    cudaMemcpy( h_partials, d_partials, partialBytes, cudaMemcpyDeviceToHost );

    // Final reduction of the per-block partials on the host.
    TYPE total = static_cast<TYPE>( 0 );
    for ( int b = 0; b < nBlocks; b++ ) {
        total += h_partials[b];
    }

    free( h_partials );
    cudaFree( d_partials );
    return total;
}
319
320
// Host wrapper: return true iff |d_a[i] - d_b[i]| < tol for all i in [0, n).
// Launches a fixed 64-block x 128-thread AND-reduction; each block writes one
// boolean flag, and the host ANDs the 64 flags to produce the verdict.
template<class TYPE>
bool equalsW( const TYPE *d_a, const TYPE *d_b, TYPE tol, size_t n )
{
    constexpr unsigned int nThreads = 128;
    constexpr int nBlocks           = 64;
    dim3 grid( nBlocks, 1, 1 );
    dim3 block( nThreads, 1, 1 );
    // Dynamic shared memory: one bool slot per thread (doubled for tiny blocks,
    // a convention kept from the classic reduction kernels).
    int shMemBytes = ( nThreads <= 32 ) ? 2 * nThreads * sizeof( bool ) : nThreads * sizeof( bool );

    const size_t flagBytes = sizeof( bool ) * nBlocks;
    bool *h_flags          = (bool *) malloc( flagBytes );
    bool *d_flags          = nullptr;
    cudaMalloc( (void **) &d_flags, flagBytes );

    equalsKernel<TYPE, nThreads><<<grid, block, shMemBytes>>>( d_a, d_b, d_flags, n, tol );
    cudaDeviceSynchronize();
    cudaMemcpy( h_flags, d_flags, flagBytes, cudaMemcpyDeviceToHost );

    // AND the per-block verdicts on the host.
    bool allEqual = true;
    for ( int b = 0; b < nBlocks; b++ ) {
        allEqual = allEqual && h_flags[b];
    }

    free( h_flags );
    cudaFree( d_flags );
    return allEqual;
}
344
345
346/********************************************************
347 * Explicit instantiations of GPUFunctionTable *
348 * and the wrappers *
349 ********************************************************/
// NOTE: no comments may be placed on the `\`-continued lines below — they would
// break the macro's line splicing.
#define INSTANTIATE( T )                                                      \
    template class GPUFunctionTable<T>;                                       \
    template T sumW<T>( const T *d_a, size_t n );                             \
    template bool equalsW<T>( const T *d_a, const T *d_b, T tol, size_t n );  \
    template void transformReLUW<T>( const T *d_a, T *d_b, size_t n );        \
    template void transformAbsW<T>( const T *d_a, T *d_b, size_t n );         \
    template void transformTanhW<T>( const T *d_a, T *d_b, size_t n );        \
    template void transformHardTanhW<T>( const T *d_a, T *d_b, size_t n );    \
    template void transformSigmoidW<T>( const T *d_a, T *d_b, size_t n );     \
    template void transformSoftPlusW<T>( const T *d_a, T *d_b, size_t n )
// Explicit instantiations for the floating-point element types AMP uses on GPU.
INSTANTIATE( double );
INSTANTIATE( float );
362
363
364} // namespace AMP



Advanced Multi-Physics (AMP)
Oak Ridge National Laboratory
Idaho National Laboratory
Los Alamos National Laboratory
This page automatically produced from the
source code by doxygen
Last updated: Tue Mar 10 2026 13:06:41.
Comments on this page