CSR Data Layout row ptr 0 2 2 5 7 data 3 1 2 4 1 1 1 col index 0 2 1 2 3 0 3 11
data 3 1 2 4 1 1 1 0 2 1 2 3 0 3 row_ptr 0 2 2 5 7 col_index CSR Data Layout 11
CSR Kernel Design Dot product ptr With vector 12
CSR Kernel Design 12 CSR Format ptr Dot product With vector
A Parallel SpMV/CSR Kernel (CUDA) 1.global void SpMV CSR(int num rows,float *data, int *col index,int *row ptr,float *x,float *y){ 2. int row blockIdx.x blockDim.x threadIdx.x; 3. if (row num rows){ 4. float dot 0; 5. int row start row ptr[row]; 6. int row end row ptr[row+1]; 7. for (int elem row start;elem row end;elem++){ 8. dot +data[elem]x[col index[elem]]; } 9. y[row]dot; Row 0 Row 2 Row 3 Nonzero values data[7] {3,1,2,4,1,1,1} Column indices col index[7]{0,2,1,2,3,0,3 Row Pointers row_ptr[5] {0,2,2,5,7} 13
1. __global__ void SpMV_CSR(int num_rows, float *data, int *col_index, int *row_ptr, float *x, float *y) { 2. int row = blockIdx.x * blockDim.x + threadIdx.x; 3. if (row < num_rows) { 4. float dot = 0; 5. int row_start = row_ptr[row]; 6. int row_end = row_ptr[row+1]; 7. for (int elem = row_start; elem < row_end; elem++) { 8. dot += data[elem] * x[col_index[elem]]; } 9. y[row] = dot; } } 13 A Parallel SpMV/CSR Kernel (CUDA) Row 0 Row 2 Row 3 Nonzero values data[7] { 3, 1, 2, 4, 1, 1, 1 } Column indices col_index[7] { 0, 2, 1, 2, 3, 0, 3 } Row Pointers row_ptr[5] { 0, 2, 2, 5, 7 }
CSR Kernel Control Divergence Threads execute different number of iterations in the kernel for-loop row ptr 2 2 5 7 data 3 1 2 4 1 1 col index 0 2 1 2 3 0 3 14
data 3 1 2 4 1 1 1 0 2 1 2 3 0 3 row_ptr 0 2 2 5 7 col_index CSR Kernel Control Divergence • Threads execute different number of iterations in the kernel for-loop 14
CSR Kernel Memory Divergence (Uncoalesced Accesses) Adjacent threads access non-adjacent memory locations Grey elements are accessed by all threads in iteration 0 row ptr 2 2 5 7 data 3 1 2 4 1 1 col index 0 2 1 2 3 0 3 15
data 3 1 2 4 1 1 1 0 2 1 2 3 0 3 row_ptr 0 2 2 5 7 col_index CSR Kernel Memory Divergence (Uncoalesced Accesses) • Adjacent threads access non-adjacent memory locations – Grey elements are accessed by all threads in iteration 0 15