8#include "AMP/utils/UtilityMacros.h"
9#include "AMP/utils/cuda/helper_string.h"
11#include "StackTrace/source_location.h"
14#include <cuda_runtime.h>
// Backend-neutral aliases for CUDA runtime constants, so calling code can be
// written against "device*" names instead of "cuda*" names.
16#define hostDeviceId cudaCpuDeviceId
18#define deviceMemAttachGlobal cudaMemAttachGlobal
// Copy-direction flags forwarded to deviceMemcpy.
20#define deviceMemcpyHostToDevice cudaMemcpyHostToDevice
21#define deviceMemcpyDeviceToHost cudaMemcpyDeviceToHost
22#define deviceMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
// Backend-neutral wrappers: each forwards its arguments unchanged to the
// named CUDA call and hands the returned status to checkCudaErrors.
// NOTE(review): cuInit is a CUDA driver-API function (declared in <cuda.h>),
// while only <cuda_runtime.h> is visibly included here -- confirm the driver
// header is in scope wherever deviceInit is used.
24#define deviceInit( ... ) checkCudaErrors( cuInit( __VA_ARGS__ ) )
25#define deviceGetCount( ... ) checkCudaErrors( cudaGetDeviceCount( __VA_ARGS__ ) )
// deviceBind selects the current device; deviceId queries it.
26#define deviceBind( ... ) checkCudaErrors( cudaSetDevice( __VA_ARGS__ ) )
27#define deviceId( ... ) checkCudaErrors( cudaGetDevice( __VA_ARGS__ ) )
28#define deviceSynchronize() checkCudaErrors( cudaDeviceSynchronize() )
// Allocation, copy, fill and release of device / managed memory.
29#define deviceMalloc( ... ) checkCudaErrors( cudaMalloc( __VA_ARGS__ ) )
30#define deviceMallocManaged( ... ) checkCudaErrors( cudaMallocManaged( __VA_ARGS__ ) )
31#define deviceMemcpy( ... ) checkCudaErrors( cudaMemcpy( __VA_ARGS__ ) )
// cudaMemset fills byte-wise, so deviceMemset inherits that behaviour.
32#define deviceMemset( ... ) checkCudaErrors( cudaMemset( __VA_ARGS__ ) )
33#define deviceFree( ... ) checkCudaErrors( cudaFree( __VA_ARGS__ ) )
34#define deviceMemPrefetchAsync( ... ) checkCudaErrors( cudaMemPrefetchAsync( __VA_ARGS__ ) )
42#ifdef __DRIVER_TYPES_H__
44 #define DEVICE_RESET cudaDeviceReset();
68 const StackTrace::source_location &source = SOURCE_LOCATION_CURRENT() );
72 const StackTrace::source_location &source = SOURCE_LOCATION_CURRENT() );
// MAX( a, b ): evaluates to the larger of a and b (b when they compare equal).
// Every use of an argument is parenthesized so that operands containing
// operators with lower precedence than `>` (e.g. `x & mask`, `c ? p : q`)
// expand correctly. The selected argument is still evaluated twice, so do not
// pass expressions with side effects.
#define MAX( a, b ) ( ( ( a ) > ( b ) ) ? ( a ) : ( b ) )
// Round a float to the nearest integer, with halfway cases rounded away from
// zero (2.5 -> 3, -2.5 -> -3).
//
// value:   number to round
// returns: the rounded value as an int; the conversion is only meaningful
//          when the rounded value fits in an int
inline int ftoi( float value )
{
    // 0.5 is deliberately left as a double literal: the offset is added in
    // double precision and the cast then truncates toward zero.
    return ( value >= 0 ? (int) ( value + 0.5 ) : (int) ( value - 0.5 ) );
}
// Lookup table from SM version to cores per SM. The key packs the version as
// ( major << 4 ) + minor, so 0x20 is SM 2.0 with 32 cores per SM.
93 sSMtoCores nGpuArchCoresPerSM[] = { { 0x20, 32 },
// Scan the table until the sentinel entry (SM == -1) and return the core
// count of the entry whose key matches the requested major/minor version.
// NOTE(review): the declaration and increment of `index` are not visible in
// this fragment -- presumably it starts at 0 and advances once per iteration.
105 while ( nGpuArchCoresPerSM[index].SM != -1 ) {
106 if ( nGpuArchCoresPerSM[index].SM == ( ( major << 4 ) + minor ) ) {
107 return nGpuArchCoresPerSM[index].Cores;
// No match: report the unknown version and fall back to the entry just before
// the position where the scan stopped (index - 1).
114 printf(
"MapSMtoCores for SM %d.%d is undefined. Default to use %d Cores/SM\n",
117 nGpuArchCoresPerSM[index - 1].Cores );
118 return nGpuArchCoresPerSM[index - 1].Cores;
122#ifdef __CUDA_RUNTIME_H__
// Declarations only -- the definitions are not part of this view, so the
// notes below are inferred from names and should be confirmed.

// presumably initializes/selects the device with ordinal devID and returns an
// int status or device id -- TODO confirm against the definition
124int gpuDeviceInit(
int devID );
// presumably returns the id of the device with the highest estimated GFLOPS
// -- TODO confirm
127int gpuGetMaxGflopsDeviceId();
// takes the program's command-line arguments (argc/argv); presumably picks a
// device based on them and returns its id -- TODO confirm
131int findCudaDevice(
int argc,
const char **argv );
// presumably reports whether the current device meets the given minimum
// compute capability (major_version.minor_version) -- TODO confirm
134bool checkCudaCapabilities(
int major_version,
int minor_version );
// Chooses a 1-D launch shape (BlockDim, GridDim) for n work items.
// Threads are grouped in units of warpSize (32); the grid is capped at
// maxGridSize (32 * 80 = 2560) blocks.
144 constexpr int warpSize = 32;
145 constexpr int maxGridSize = 32 * 80;
// Number of warps needed to cover n items, rounded up.
// NOTE(review): the result is stored in an int; if n is a size_t (as the
// signature listed for setKernelDims suggests) large n narrows here -- confirm.
148 int warpCount = ( n / warpSize ) + ( ( ( n % warpSize ) == 0 ) ? 0 : 1 );
// Between 1 and 4 warps per block, i.e. 32 to 128 threads per block.
149 int warpPerBlock = std::max( 1, std::min( 4, warpCount ) );
150 int threadCount = warpSize * warpPerBlock;
// At least one block, at most maxGridSize blocks.
// NOTE(review): warpCount / warpPerBlock truncates (5 warps at 4 per block
// gives 1 block), and the grid is capped, so blockCount * threadCount can be
// smaller than n; kernels launched with these dims presumably loop over the
// remaining items -- confirm against callers.
151 int blockCount = std::min( maxGridSize, std::max( 1, warpCount / warpPerBlock ) );
// Both dimensions are one-dimensional: only x is greater than 1.
152 BlockDim = dim3( threadCount, 1, 1 );
153 GridDim = dim3( blockCount, 1, 1 );
int _ConvertSMVer2Cores(int major, int minor)
static void setKernelDims(size_t n, dim3 &BlockDim, dim3 &GridDim)
void checkCudaErrors(T result, const StackTrace::source_location &source=SOURCE_LOCATION_CURRENT())
void getLastDeviceError(const char *errorMessage, const StackTrace::source_location &source=SOURCE_LOCATION_CURRENT())
const char * cudaGetName(T result)
AMP::Utilities::MemoryType getCudaMemoryType(const void *ptr)
MemoryType
Enum to store pointer type.