The following code sample allocates a width x height 2D array of floating-point values and shows how to loop over the array elements in device code:

// Host code
int width = 64, height = 64;
float* devPtr;
size_t pitch;
cudaMallocPitch(&devPtr, &pitch, width * sizeof(float), height);
MyKernel<<<100, 512>>>(devPtr, pitch, width, height);

// Device code
__global__ void MyKernel(float* devPtr, size_t pitch,
                         int width, int height)
{
    for (int r = 0; r < height; ++r) {
        float* row = (float*)((char*)devPtr + r * pitch);
        for (int c = 0; c < width; ++c) {
            float element = row[c];
        }
    }
}

The following code sample allocates a width x height x depth 3D array of floating-point values and shows how to loop over the array elements in device code:

// Host code
int width = 64, height = 64, depth = 64;
cudaExtent extent = make_cudaExtent(width * sizeof(float),
                                    height, depth);
cudaPitchedPtr devPitchedPtr;
cudaMalloc3D(&devPitchedPtr, extent);
MyKernel<<<100, 512>>>(devPitchedPtr, width, height, depth);

// Device code
__global__ void MyKernel(cudaPitchedPtr devPitchedPtr,
                         int width, int height, int depth)
{
    char* devPtr = (char*)devPitchedPtr.ptr;
    size_t pitch = devPitchedPtr.pitch;
    size_t slicePitch = pitch * height;
    for (int z = 0; z < depth; ++z) {
        char* slice = devPtr + z * slicePitch;
        for (int y = 0; y < height; ++y) {
            float* row = (float*)(slice + y * pitch);
            for (int x = 0; x < width; ++x) {
                float element = row[x];
            }
        }
    }
}

The reference manual lists all the various functions used to copy memory between linear memory allocated with cudaMalloc(), linear memory allocated with cudaMallocPitch() or cudaMalloc3D(), CUDA arrays, and memory allocated for variables declared in global or constant memory space.
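As a brief illustration of one such copy (a sketch, not part of the guide's samples), the following host code fills the pitched 2D allocation from the first example above from an ordinary, densely packed host array using cudaMemcpy2D(); the buffer name hostData and the malloc() call are placeholders introduced here:

// Host code (illustrative sketch): copy a densely packed host array
// into the pitched device allocation from the 2D example above.
// hostData is a hypothetical host buffer whose pitch is width * sizeof(float).
float* hostData = (float*)malloc(width * height * sizeof(float));
// ... fill hostData ...
cudaMemcpy2D(devPtr, pitch,                    // destination pointer and pitch in bytes
             hostData, width * sizeof(float),  // source pointer and pitch in bytes
             width * sizeof(float), height,    // row width in bytes, number of rows
             cudaMemcpyHostToDevice);

Passing the pitch returned by cudaMallocPitch() as the destination pitch lets the runtime copy row by row while preserving the padding added for alignment.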
The following code sample illustrates various ways of accessing global variables via the runtime API:

__constant__ float constData[256];
float data[256];
cudaMemcpyToSymbol(constData, data, sizeof(data));
cudaMemcpyFromSymbol(data, constData, sizeof(data));

__device__ float devData;
float value = 3.14f;
cudaMemcpyToSymbol(devData, &value, sizeof(float));

__device__ float* devPointer;
float* ptr;
cudaMalloc(&ptr, 256 * sizeof(float));
cudaMemcpyToSymbol(devPointer, &ptr, sizeof(ptr));

cudaGetSymbolAddress() is used to retrieve the address pointing to the memory allocated for a variable declared in global memory space. The size of the allocated memory is obtained through cudaGetSymbolSize().

3.2.3. Shared Memory

As detailed in Variable Type Qualifiers, shared memory is allocated using the __shared__ qualifier.

Shared memory is expected to be much faster than global memory, as mentioned in Thread Hierarchy and detailed in Shared Memory. Any opportunity to replace global memory accesses by shared memory accesses should therefore be exploited, as illustrated by the following matrix multiplication example.
The following code sample is a straightforward implementation of matrix multiplication that does not take advantage of shared memory. Each thread reads one row of A and one column of B and computes the corresponding element of C, as illustrated in Figure 9. A is therefore read B.width times from global memory and B is read A.height times.

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.width + col)
typedef struct {
    int width;
    int height;
    float* elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);
    cudaMalloc(&d_B.elements, size);
    cudaMemcpy(d_B.elements, B.elements, size,
               cudaMemcpyHostToDevice);

    // Allocate C in device memory
    Matrix d_C;
    d_C.width = C.width; d_C.height = C.height;
    size = C.width * C.height * sizeof(float);
    cudaMalloc(&d_C.elements, size);

    // Invoke kernel
    dim3 dimBlock(BLOCK_SIZE, BLOCK_SIZE);
    dim3 dimGrid(B.width / dimBlock.x, A.height / dimBlock.y);
    MatMulKernel<<<dimGrid, dimBlock>>>(d_A, d_B, d_C);

    // Read C from device memory
    cudaMemcpy(C.elements, d_C.elements, size,
               cudaMemcpyDeviceToHost);

    // Free device memory
    cudaFree(d_A.elements);
    cudaFree(d_B.elements);
    cudaFree(d_C.elements);
}

// Matrix multiplication kernel called by MatMul()
__global__ void MatMulKernel(Matrix A, Matrix B, Matrix C)
{
    // Each thread computes one element of C
    // by accumulating results into Cvalue
    float Cvalue = 0;
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    for (int e = 0; e < A.width; ++e)
        Cvalue += A.elements[row * A.width + e]
                * B.elements[e * B.width + col];
    C.elements[row * C.width + col] = Cvalue;
}
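For context, a minimal host-side driver for MatMul() might look like the sketch below; the size N, the fill step, and the use of malloc() are illustrative assumptions, not part of the guide's sample:

// Illustrative usage sketch: allocate host matrices and invoke MatMul().
int N = 1024;                                   // assumed to be a multiple of BLOCK_SIZE
Matrix A, B, C;
A.width = A.height = N; A.elements = (float*)malloc(N * N * sizeof(float));
B.width = B.height = N; B.elements = (float*)malloc(N * N * sizeof(float));
C.width = C.height = N; C.elements = (float*)malloc(N * N * sizeof(float));
// ... fill A.elements and B.elements with input values ...
MatMul(A, B, C);                                // on return, C.elements holds A * B
free(A.elements); free(B.elements); free(C.elements);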
Figure 9 Matrix Multiplication without Shared Memory

The following code sample is an implementation of matrix multiplication that does take advantage of shared memory. In this implementation, each thread block is responsible for computing one square sub-matrix Csub of C and each thread within the block is responsible for computing one element of Csub. As illustrated in Figure 10, Csub is equal to the product of two rectangular matrices: the sub-matrix of A of dimension (A.width, block_size) that has the same row indices as Csub, and the sub-matrix of B of dimension (block_size, A.width) that has the same column indices as Csub. In order to fit into the device's resources, these two rectangular matrices are divided into as many square matrices of dimension block_size as necessary and Csub is computed as the sum of the products of these square matrices. Each of these products is performed by first loading the two corresponding square matrices from global memory to shared memory with one thread loading one element of each matrix, and then by having each thread compute one element of the product. Each thread accumulates the result of each of these products into a register and once done writes the result to global memory.
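Before the full example, the core of this blocking scheme can be summarized in a compact sketch. The kernel below is only an illustration with a flat-pointer interface and illustrative names; it is not the guide's kernel, which follows and is built on the augmented Matrix type. It assumes BLOCK_SIZE as defined above and matrix dimensions that are multiples of BLOCK_SIZE:

// Illustrative sketch of the tiling loop: each thread block computes one
// BLOCK_SIZE x BLOCK_SIZE tile of C; A is Aheight x Awidth, B is Awidth x Bwidth,
// both row-major.
__global__ void TiledMatMulSketch(const float* A, const float* B, float* C,
                                  int Awidth, int Bwidth)
{
    __shared__ float As[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float Bs[BLOCK_SIZE][BLOCK_SIZE];

    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y;
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float Cvalue = 0;

    // Loop over the square sub-matrices of A and B needed to compute C(row, col).
    for (int m = 0; m < Awidth / BLOCK_SIZE; ++m) {
        // Each thread loads one element of each sub-matrix into shared memory.
        As[threadIdx.y][threadIdx.x] = A[row * Awidth + m * BLOCK_SIZE + threadIdx.x];
        Bs[threadIdx.y][threadIdx.x] = B[(m * BLOCK_SIZE + threadIdx.y) * Bwidth + col];
        __syncthreads();          // wait until both tiles are fully loaded

        // Multiply the two tiles, accumulating into a register.
        for (int e = 0; e < BLOCK_SIZE; ++e)
            Cvalue += As[threadIdx.y][e] * Bs[e][threadIdx.x];
        __syncthreads();          // wait before the tiles are overwritten
    }
    C[row * Bwidth + col] = Cvalue;
}

The two __syncthreads() calls are what make the shared-memory tiles safe to reuse: the first ensures both tiles are fully loaded before any thread reads them, and the second ensures all threads have finished reading before the next iteration overwrites them.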
By blocking the computation this way, we take advantage of fast shared memory and save a lot of global memory bandwidth, since A is only read (B.width / block_size) times from global memory and B is read (A.height / block_size) times.

The Matrix type from the previous code sample is augmented with a stride field, so that sub-matrices can be efficiently represented with the same type. __device__ functions are used to get and set elements and build any sub-matrix from a matrix.

// Matrices are stored in row-major order:
// M(row, col) = *(M.elements + row * M.stride + col)
typedef struct {
    int width;
    int height;
    int stride;
    float* elements;
} Matrix;

// Thread block size
#define BLOCK_SIZE 16

// Get a matrix element
__device__ float GetElement(const Matrix A, int row, int col)
{
    return A.elements[row * A.stride + col];
}

// Set a matrix element
__device__ void SetElement(Matrix A, int row, int col,
                           float value)
{
    A.elements[row * A.stride + col] = value;
}

// Get the BLOCK_SIZExBLOCK_SIZE sub-matrix Asub of A that is
// located col sub-matrices to the right and row sub-matrices down
// from the upper-left corner of A
__device__ Matrix GetSubMatrix(Matrix A, int row, int col)
{
    Matrix Asub;
    Asub.width = BLOCK_SIZE;
    Asub.height = BLOCK_SIZE;
    Asub.stride = A.stride;
    Asub.elements = &A.elements[A.stride * BLOCK_SIZE * row
                                + BLOCK_SIZE * col];
    return Asub;
}

// Forward declaration of the matrix multiplication kernel
__global__ void MatMulKernel(const Matrix, const Matrix, Matrix);

// Matrix multiplication - Host code
// Matrix dimensions are assumed to be multiples of BLOCK_SIZE
void MatMul(const Matrix A, const Matrix B, Matrix C)
{
    // Load A and B to device memory
    Matrix d_A;
    d_A.width = d_A.stride = A.width; d_A.height = A.height;
    size_t size = A.width * A.height * sizeof(float);
    cudaMalloc(&d_A.elements, size);
    cudaMemcpy(d_A.elements, A.elements, size,
               cudaMemcpyHostToDevice);
    Matrix d_B;
    d_B.width = d_B.stride = B.width; d_B.height = B.height;
    size = B.width * B.height * sizeof(float);