Stokhos Package Browser (Single Doxygen Collection) Version of the Day
csr_vector.h
/*
 * Copyright 2008-2009 NVIDIA Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include <cusp/detail/device/arch.h>
#include <cusp/detail/device/common.h>
#include <cusp/detail/device/utils.h>
#include <cusp/detail/device/texture.h>
#include <thrust/device_ptr.h>
#include <cudaProfiler.h>
#include <cuda_profiler_api.h>
#include <stdio.h>

#include "Stokhos_config.h"
#if 0 && defined(HAVE_STOKHOS_CUSPARSE)
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 1
#else
#define USE_CUSPARSE_ROW 0
#define USE_CUSPARSE_COL 0
#endif

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL
#include <sstream>
#include <stdexcept>
#include <cuda_runtime.h>
#include <cusparse.h>
#endif

namespace cusp
{
namespace detail
{
namespace device
{

#if USE_CUSPARSE_ROW || USE_CUSPARSE_COL

class CudaSparseSingleton {
public:

  cusparseStatus_t status;
  cusparseHandle_t handle;
  cusparseMatDescr_t descra;

  static CudaSparseSingleton & singleton();

private:

  CudaSparseSingleton()
  {
    status = cusparseCreate(&handle);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Initialization failed") );
    }

    status = cusparseCreateMatDescr(&descra);
    if (status != CUSPARSE_STATUS_SUCCESS)
    {
      throw std::runtime_error( std::string("ERROR - CUSPARSE Library Matrix descriptor failed") );
    }

    cusparseSetMatType(descra, CUSPARSE_MATRIX_TYPE_GENERAL);
    cusparseSetMatIndexBase(descra, CUSPARSE_INDEX_BASE_ZERO);
  }

  CudaSparseSingleton( const CudaSparseSingleton & );
  CudaSparseSingleton & operator = ( const CudaSparseSingleton & );
};

CudaSparseSingleton & CudaSparseSingleton::singleton()
{
  static CudaSparseSingleton s;
  return s;
}

#endif

#if USE_CUSPARSE_ROW

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::row_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;
  cusparseStatus_t status =
    cusparseDcsrmm2(s.handle,
                    CUSPARSE_OPERATION_NON_TRANSPOSE,
                    CUSPARSE_OPERATION_TRANSPOSE,
                    A.num_rows, x.num_cols, A.num_cols, A.num_entries,
                    alpha,
                    s.descra,
                    thrust::raw_pointer_cast(&A.values[0]),
                    thrust::raw_pointer_cast(&A.row_offsets[0]),
                    thrust::raw_pointer_cast(&A.column_indices[0]),
                    thrust::raw_pointer_cast(&(x.values)[0]),
                    x.num_rows,
                    beta,
                    thrust::raw_pointer_cast(&(y.values)[0]),
                    y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm2 failed") );
  }
}

#else

template <typename IndexType, typename ValueType, unsigned MAX_NNZ_PER_ROW>
__global__ void
spmm_csr_vector_kernel_row(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  // Allocate storage for a shared copy of the current row of A
  extern __shared__ int sh[];
  volatile ValueType * const sh_Aval = (ValueType*) sh;
  volatile IndexType * const sh_Ac   = (IndexType*) (sh_Aval + MAX_NNZ_PER_ROW * blockDim.y);

  // Global row index
  const IndexType row = threadIdx.y + blockDim.y * blockIdx.x;
  if (row < Anum_rows) {
    const IndexType row_start = Ar[row];
    const IndexType row_end   = Ar[row+1];

    // Initialize y for this row
    for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x) {
      y[j+xnum_cols*row] = 0.0;
    }

    // Loop over the columns of A, MAX_NNZ_PER_ROW at a time
    for (IndexType block_col=row_start; block_col<row_end;
         block_col+=MAX_NNZ_PER_ROW) {
      const IndexType r = block_col + MAX_NNZ_PER_ROW < row_end ?
        MAX_NNZ_PER_ROW : row_end-block_col;

      // Read this chunk of the row of A into shared memory using all threads
      // in the block.  It is stored in a transposed layout to avoid bank
      // conflicts between warps (the kernel is completely bandwidth-limited,
      // so this doesn't really help performance), and no synchronization is
      // needed because blockDim.x <= warp_size.
      for (IndexType i=threadIdx.x; i<r; i+=blockDim.x){
        sh_Aval[i*blockDim.y+threadIdx.y] = Aval[i+block_col];
        sh_Ac[  i*blockDim.y+threadIdx.y] = Ac  [i+block_col];
      }

      // Loop over the columns of x
      for (IndexType j=threadIdx.x; j<xnum_cols; j+=blockDim.x){

        // Loop over the entries of this row of A
        ValueType sum = 0.0;
        for (IndexType jj=0; jj<r; jj++){
          IndexType J = sh_Ac[jj*blockDim.y+threadIdx.y];
          sum += sh_Aval[jj*blockDim.y+threadIdx.y] * x[j+xnum_cols*J];
        }
        y[j+xnum_cols*row] += sum;

      } // Loop over cols of x

    } // Loop over block cols of A
  }
}

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::row_major)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;
  typedef typename Matrix::memory_space MemorySpace;

  // Use a half-warp for the columns of x to get a little better granularity
  // when the number of columns is not divisible by 32.  These numbers work
  // out to 4 warps/block and 16 blocks/SM (based on both shared memory and
  // registers for Kepler), which gives 100% occupancy; the arithmetic is
  // spelled out after this function.
  const unsigned MAX_NNZ_PER_ROW = 32;
  const size_t COLS_PER_BLOCK = 16;
  const size_t ROWS_PER_BLOCK = 8;
  const size_t shared =
    ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (sizeof(IndexType) + sizeof(ValueType));
  const size_t NUM_BLOCKS = (A.num_rows + ROWS_PER_BLOCK-1) / ROWS_PER_BLOCK;

  dim3 block(COLS_PER_BLOCK, ROWS_PER_BLOCK, 1);
  dim3 grid(NUM_BLOCKS, 1);

  spmm_csr_vector_kernel_row<IndexType, ValueType, MAX_NNZ_PER_ROW> <<<grid, block, shared>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
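
// A minimal sketch of the occupancy arithmetic behind the comment above,
// assuming the common instantiation IndexType = int (4 bytes) and
// ValueType = double (8 bytes); the exact sizes depend on the Matrix type:
//
//   shared memory per block = ROWS_PER_BLOCK * MAX_NNZ_PER_ROW * (4 + 8)
//                           = 8 * 32 * 12 = 3072 bytes
//   threads per block       = COLS_PER_BLOCK * ROWS_PER_BLOCK = 16 * 8 = 128
//                             (4 warps)
//
// With Kepler's 48 KB of shared memory per SM, 48 KB / 3 KB = 16 such blocks
// fit, i.e. 16 * 128 = 2048 resident threads, which is Kepler's hardware
// maximum -- hence the 100% occupancy claim.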

#endif

#if USE_CUSPARSE_COL

void __spmm_csr_vector(
  const csr_matrix<int,double,device_memory>& A,
  const array2d<double, device_memory, column_major>& x,
  array2d<double, device_memory, column_major>& y,
  cusp::column_major)
{
  CudaSparseSingleton & s = CudaSparseSingleton::singleton();
  const double alpha = 1, beta = 0;
  cusparseStatus_t status =
    cusparseDcsrmm(s.handle,
                   CUSPARSE_OPERATION_NON_TRANSPOSE,
                   A.num_rows, x.num_cols, A.num_cols,
                   alpha,
                   s.descra,
                   thrust::raw_pointer_cast(&A.values[0]),
                   thrust::raw_pointer_cast(&A.row_offsets[0]),
                   thrust::raw_pointer_cast(&A.column_indices[0]),
                   thrust::raw_pointer_cast(&(x.values)[0]),
                   x.num_rows,
                   beta,
                   thrust::raw_pointer_cast(&(y.values)[0]),
                   y.num_rows);

  if ( CUSPARSE_STATUS_SUCCESS != status ) {
    throw std::runtime_error( std::string("ERROR - cusparseDcsrmm failed") );
  }
}

#else

#if 1

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ap,
                           const IndexType * Aj,
                           const ValueType * Ax,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);        // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;                // global vector index
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;              // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                 // total number of active vectors
  // (see the index-arithmetic example after this kernel)

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors)
  {
    // use two threads to fetch Ap[row] and Ap[row+1]
    // this is considerably faster than the straightforward version
    if(thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ap[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];  // same as: row_start = Ap[row];
    const IndexType row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ap[row+1];

    // loop over the columns of x
    for (IndexType j = 0; j < xnum_cols; j++){

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Aj and Ax
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;

        // accumulate local sums
        if(jj >= row_start && jj < row_end)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];

        // accumulate local sums
        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }
      else
      {
        // accumulate local sums
        for(IndexType jj = row_start + thread_lane; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Ax[jj] * x[Aj[jj]+xnum_rows*j];
      }

      // store local sum in shared memory
      sdata[threadIdx.x] = sum;

      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];

      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];

    }
  }
}
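
// Index-arithmetic example for the kernel above -- purely illustrative
// numbers, assuming THREADS_PER_VECTOR = 8 and VECTORS_PER_BLOCK = 32
// (a 256-thread block): threadIdx.x = 45 gives thread_lane = 45 & 7 = 5 and
// vector_lane = 45 / 8 = 5, i.e. lane 5 of the block's 6th vector, and that
// vector walks rows 5, 5 + num_vectors, 5 + 2*num_vectors, ...  The bitmask
// form of thread_lane (and the warp-synchronous reduction) relies on
// THREADS_PER_VECTOR being a power of two no larger than the warp size.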

#else

template <typename IndexType, typename ValueType, unsigned int VECTORS_PER_BLOCK, unsigned int THREADS_PER_VECTOR>
__launch_bounds__(VECTORS_PER_BLOCK * THREADS_PER_VECTOR,1)
__global__ void
spmm_csr_vector_kernel_col(const IndexType Anum_rows,
                           const IndexType xnum_rows,
                           const IndexType xnum_cols,
                           const IndexType * Ar,
                           const IndexType * Ac,
                           const ValueType * Aval,
                           const ValueType * x,
                           ValueType * y)
{
  __shared__ volatile ValueType sdata[VECTORS_PER_BLOCK * THREADS_PER_VECTOR + THREADS_PER_VECTOR / 2];  // padded to avoid reduction conditionals
  __shared__ volatile IndexType ptrs[VECTORS_PER_BLOCK][2];

  extern __shared__ int sha[];
  ValueType * const sh_Aval = (ValueType*) sha;
  IndexType * const sh_Ac   = (IndexType*) (sh_Aval + 32 * VECTORS_PER_BLOCK);

  const IndexType THREADS_PER_BLOCK = VECTORS_PER_BLOCK * THREADS_PER_VECTOR;

  const IndexType thread_id   = THREADS_PER_BLOCK * blockIdx.x + threadIdx.x;  // global thread index
  const IndexType thread_lane = threadIdx.x & (THREADS_PER_VECTOR - 1);        // thread index within the vector
  const IndexType vector_id   = thread_id / THREADS_PER_VECTOR;                // global vector index, i.e. global warp id
  const IndexType vector_lane = threadIdx.x / THREADS_PER_VECTOR;              // vector index within the block
  const IndexType num_vectors = VECTORS_PER_BLOCK * gridDim.x;                 // total number of active vectors

  for(IndexType row = vector_id; row < Anum_rows; row += num_vectors) {
    // use two threads to fetch Ar[row] and Ar[row+1]
    // this is considerably faster than the straightforward version
    if(thread_lane < 2)
      ptrs[vector_lane][thread_lane] = Ar[row + thread_lane];

    const IndexType row_start = ptrs[vector_lane][0];  // same as: row_start = Ar[row];
    const IndexType row_end   = ptrs[vector_lane][1];  // same as: row_end   = Ar[row+1];
    // nnz in row
    const IndexType r = row_end - row_start;

    // fetch Aval and Ac for the current row of A and store them in shared mem
    for (IndexType i = thread_lane; i < r; i += THREADS_PER_VECTOR){
      sh_Aval[vector_lane*32+i] = Aval[i+row_start];
      sh_Ac[vector_lane*32+i]   = Ac[i+row_start];
    }

    // loop over the columns of x
    for (IndexType j = 0; j < xnum_cols; j++){

      // initialize local sum
      ValueType sum = 0;

      if (THREADS_PER_VECTOR == 32 && row_end - row_start > 32)
      {
        // ensure aligned memory access to Ac and Aval
        IndexType jj = row_start - (row_start & (THREADS_PER_VECTOR - 1)) + thread_lane;
        // accumulate local sums
        if(jj >= row_start && jj < row_end)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
        // accumulate local sums
        for(jj += THREADS_PER_VECTOR; jj < row_end; jj += THREADS_PER_VECTOR)
          sum += Aval[jj] * x[j*xnum_rows+Ac[jj]];
      }
      else
      {
        // accumulate local sums (note the row_start offset: jj is a local
        // index within the row here, unlike the branch above)
        for (IndexType jj = thread_lane; jj < r; jj += THREADS_PER_VECTOR)
          //sum += sh_Aval[jj+vector_lane*32] * x[j * xnum_rows + sh_Ac[vector_lane*32+jj]];
          sum += Aval[row_start+jj] * x[j*xnum_rows+Ac[row_start+jj]];
      }

      // store local sum in shared memory
      sdata[threadIdx.x] = sum;
      // reduce local sums to row sum
      if (THREADS_PER_VECTOR > 16) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
      if (THREADS_PER_VECTOR >  8) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  8];
      if (THREADS_PER_VECTOR >  4) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  4];
      if (THREADS_PER_VECTOR >  2) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  2];
      if (THREADS_PER_VECTOR >  1) sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x +  1];
      // first thread writes the result
      if (thread_lane == 0)
        y[j*Anum_rows+row] = sdata[threadIdx.x];
    }
  }
}

#endif

template <bool UseCache, unsigned int THREADS_PER_VECTOR,
          typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector_col(const Matrix& A, const Vector2& x, Vector3& y)
{
  typedef typename Matrix::index_type IndexType;
  typedef typename Matrix::value_type ValueType;

  const size_t THREADS_PER_BLOCK = 256;
  const size_t VECTORS_PER_BLOCK = THREADS_PER_BLOCK / THREADS_PER_VECTOR;
  //const size_t SHARED = VECTORS_PER_BLOCK * 32 * (sizeof(IndexType)+sizeof(ValueType));
  const size_t SHARED = 0;

  // Cap the grid at the number of blocks that can be resident at once
  // (see the note after this function).
  const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(
    spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR>,
    THREADS_PER_BLOCK, SHARED);
  const size_t NUM_BLOCKS = std::min<size_t>(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, VECTORS_PER_BLOCK));

  spmm_csr_vector_kernel_col<IndexType, ValueType, VECTORS_PER_BLOCK, THREADS_PER_VECTOR> <<<NUM_BLOCKS, THREADS_PER_BLOCK, SHARED>>>
    (A.num_rows, x.num_rows, x.num_cols,
     thrust::raw_pointer_cast(&A.row_offsets[0]),
     thrust::raw_pointer_cast(&A.column_indices[0]),
     thrust::raw_pointer_cast(&A.values[0]),
     thrust::raw_pointer_cast(&(x.values)[0]),
     thrust::raw_pointer_cast(&(y.values)[0]));
}
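
// Launch-configuration note -- a sketch of the reasoning, not part of the
// original source: NUM_BLOCKS is capped by arch::max_active_blocks(...) so
// that the whole grid can be resident simultaneously; the kernel then covers
// every row of A by striding row += num_vectors, so correctness does not
// depend on launching one vector per row.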

template <typename Matrix, typename Vector2, typename Vector3>
void __spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y,
                       cusp::column_major)
{
#if 0
  typedef typename Vector2::index_type IndexType;
  for (IndexType col=0; col<x.num_cols; ++col) {
    multiply(A, x.column(col), y.column(col));
  }
#else
  typedef typename Matrix::index_type IndexType;

  // Pick the number of threads that cooperate on each row from the average
  // number of nonzeros per row (a worked example follows this function).
  const IndexType nnz_per_row = A.num_entries / A.num_rows;

  if (nnz_per_row <=  2) { __spmm_csr_vector_col<false, 2>(A, x, y); return; }
  if (nnz_per_row <=  4) { __spmm_csr_vector_col<false, 4>(A, x, y); return; }
  if (nnz_per_row <=  8) { __spmm_csr_vector_col<false, 8>(A, x, y); return; }
  if (nnz_per_row <= 16) { __spmm_csr_vector_col<false,16>(A, x, y); return; }

  __spmm_csr_vector_col<false,32>(A, x, y);
#endif
}
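
// Worked example for the dispatch above (hypothetical sizes, for
// illustration only): a matrix with A.num_rows = 1000 and
// A.num_entries = 5000 has nnz_per_row = 5, so the <false,8> instantiation
// is chosen -- 8 threads cooperate on each row, and each 256-thread block
// processes 256 / 8 = 32 rows at a time.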

#endif

// Entry point: resize y to match A and x, then dispatch on the storage
// orientation of x (an illustrative usage sketch follows at the end of the
// file).
template <typename Matrix, typename Vector2, typename Vector3>
void spmm_csr_vector(const Matrix& A, const Vector2& x, Vector3& y)
{
  y.resize(A.num_rows, x.num_cols);
  __spmm_csr_vector(A, x, y, typename Vector2::orientation());
}

} // end namespace device
} // end namespace detail
} // end namespace cusp
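
// Example usage -- a minimal sketch, not part of the original header, kept
// disabled so it cannot affect builds.  It assumes the concrete CUSP
// containers the CUSPARSE branches above expect (cusp::csr_matrix and
// cusp::array2d in device_memory); the poisson5pt test matrix and the
// include paths come from stock CUSP and are illustrative only.
#if 0
#include <cusp/csr_matrix.h>
#include <cusp/array2d.h>
#include <cusp/gallery/poisson.h>

void spmm_csr_vector_example()
{
  // A: sparse 5-point Poisson matrix assembled on the device
  cusp::csr_matrix<int, double, cusp::device_memory> A;
  cusp::gallery::poisson5pt(A, 64, 64);

  // x: 8 right-hand-side vectors; the storage orientation of x selects
  // which __spmm_csr_vector overload spmm_csr_vector dispatches to
  cusp::array2d<double, cusp::device_memory, cusp::column_major>
    x(A.num_cols, 8, 1.0);
  cusp::array2d<double, cusp::device_memory, cusp::column_major> y;

  // y is resized to A.num_rows x 8 and overwritten with A*x
  cusp::detail::device::spmm_csr_vector(A, x, y);
}
#endif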