Stokhos Package Browser (Single Doxygen Collection) Version of the Day
Loading...
Searching...
No Matches
array2d.h
Go to the documentation of this file.
1/*
2 * Copyright 2008-2009 NVIDIA Corporation
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include <cusp/array2d.h>
17#include <cusp/array1d.h>
18#include <cuda.h>
19#include <iostream>
20
21namespace cusp
22{
23namespace detail
24{
25namespace device
26{
27
28
29template <typename IndexType,
30 typename ValueType>
31__global__ void
32row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
33 const IndexType num_cols,
34 const ValueType * x,
35 const ValueType * y,
36 ValueType * z)
37{
38 for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
39 row += gridDim.x * blockDim.y)
40 for (IndexType j = threadIdx.x; j < num_cols; j+=blockDim.x )
41 z[j+num_cols*row]= a * x[j+num_cols*row] + b * y[j+num_cols*row];
42}
43
44template <typename IndexType,
45 typename ValueType>
46__global__ void
47col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows,
48 const IndexType num_cols,
49 const ValueType * x,
50 const ValueType * y,
51 ValueType * z)
52{
53 for (IndexType j = threadIdx.y; j < num_cols; j+=blockDim.y )
54 for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < num_rows;
55 row += gridDim.x * blockDim.x)
56 z[j*num_rows+row]= a * x[j*num_rows+row] + b * y[j*num_rows+row];
57}
58
59template <typename IndexType,
60 typename ValueType>
61__global__ void
62row_spmm_dense_diag_kernel(const IndexType Anum_rows,
63 const IndexType Anum_cols,
64 const ValueType * Aval,
65 const ValueType * x,
66 ValueType * y)
67{
68 for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
69 row += gridDim.x * blockDim.y)
70 for (IndexType j = threadIdx.x; j < Anum_cols; j+=blockDim.x )
71 y[j+Anum_cols*row]= x[j] * Aval[j+Anum_cols*row];
72}
73
74template <typename IndexType,
75 typename ValueType>
76__global__ void
77col_spmm_dense_diag_kernel(const IndexType Anum_rows,
78 const IndexType Anum_cols,
79 const ValueType * Aval,
80 const ValueType * x,
81 ValueType * y)
82{
83 for (IndexType j = threadIdx.y; j < Anum_cols; j+=blockDim.y )
84 for (IndexType row = threadIdx.x + blockDim.x * blockIdx.x; row < Anum_rows;
85 row += gridDim.x * blockDim.x)
86 y[j*Anum_rows+row]= x[j] * Aval[j*Anum_rows+row];
87}
88
89template <typename IndexType,
90 typename ValueType>
91__global__ void
92row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows,
93 const IndexType Anum_cols,
94 const ValueType * Aval,
95 const ValueType * x,
96 ValueType * temp)
97{
98 extern __shared__ int sh[];
99 ValueType * const sdata = (ValueType*) sh;
100
101 for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
102 sdata[col + Anum_cols*threadIdx.y] = 0;
103 }
104 for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < Anum_rows;
105 row += gridDim.x * blockDim.y) {
106 for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
107 sdata[col + Anum_cols*threadIdx.y] +=
108 Aval[col + Anum_cols * row] * x[col + Anum_cols * row];
109 }
110 }
111 //sum all local sums together to get
112 __syncthreads();
113 IndexType nwarp = blockDim.y / 2;
114 while (nwarp > 0) {
115 for (IndexType col = threadIdx.x; col < Anum_cols; col += blockDim.x){
116 IndexType j = threadIdx.y;
117 if (j < nwarp){
118 IndexType j2 = j+nwarp;
119 sdata[col+j*Anum_cols] += sdata[col+j2*Anum_cols];
120 }
121 __syncthreads();
122 }
123 nwarp /= 2;
124 }
125 __syncthreads();
126 for (IndexType col = threadIdx.x; col < Anum_cols; col+=blockDim.x){
127 temp[ Anum_cols * blockIdx.x + col ] = sdata[col];
128 }
129
130}
131
132template <typename IndexType,
133 typename ValueType>
134__global__ void
135col_spmm_MVdot_kernel(const IndexType Anum_rows,
136 const IndexType Anum_cols,
137 const ValueType * Aval,
138 const ValueType * x,
139 ValueType * temp)
140{
141 extern __shared__ int sh[];
142 volatile ValueType * const sdata = (ValueType*) sh;
143
144 IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
145
146 for (IndexType col = threadIdx.y; col < Anum_cols; col += blockDim.y) {
147
148 ValueType s = 0.0;
149 for (IndexType row = threadIdx.x+blockDim.x*blockIdx.x; row < Anum_rows;
150 row += gridDim.x*blockDim.x) {
151 s += Aval[col*Anum_rows+row] * x[col*Anum_rows+row];
152 }
153
154 // sum all local sums together to get
155 sdata[tid] = s;
156 if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
157 if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
158 if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
159 if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
160 if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
161
162 // store results
163 if (threadIdx.x == 0)
164 temp[col*gridDim.x+blockIdx.x] = sdata[tid];
165 }
166
167}
168
169//kernel to compute final dot product by adding the sums across the blocks
170template <typename IndexType,
171 typename ValueType>
172__global__ void
173row_reduce_kernel(const IndexType num_rows,
174 const IndexType num_cols,
175 const ValueType * temp,
176 ValueType * y)
177{
178 extern __shared__ int sh[];
179 ValueType * const sdata = (ValueType*) sh;
180 for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
181 sdata[col + num_cols*threadIdx.y] = 0;
182 }
183 for (IndexType row = threadIdx.y + blockDim.y * blockIdx.x; row < num_rows;
184 row += gridDim.x * blockDim.y) {
185 for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
186 sdata[col + num_cols*threadIdx.y] += temp[col + num_cols * row];
187 }
188 }
189 __syncthreads();
190 IndexType nwarp = blockDim.y / 2;
191 while (nwarp > 0) {
192 for (IndexType col = threadIdx.x; col < num_cols; col += blockDim.x){
193 IndexType j = threadIdx.y;
194 if (j < nwarp){
195 IndexType j2 = j+nwarp;
196 sdata[col+j*num_cols] += sdata[col+j2*num_cols];
197 }
198 __syncthreads();
199 }
200 nwarp /= 2;
201 }
202 __syncthreads();
203 for (IndexType col = threadIdx.x; col < num_cols; col+=blockDim.x){
204 y[col + num_cols * blockIdx.x] = sdata[col];
205 }
206}
207
208//kernel to compute final dot product by adding the sums across the blocks
209template <typename IndexType,
210 typename ValueType>
211__global__ void
212col_reduce_kernel(const IndexType num_rows,
213 const IndexType num_cols,
214 const ValueType * temp,
215 ValueType * y)
216{
217 extern __shared__ int sh[];
218 volatile ValueType * const sdata = (ValueType*) sh;
219
220 IndexType tid = threadIdx.x + threadIdx.y*blockDim.x;
221
222 for (IndexType col = threadIdx.y; col < num_cols; col += blockDim.y) {
223
224 ValueType s = 0.0;
225 for (IndexType row = threadIdx.x; row < num_rows; row += blockDim.x) {
226 s += temp[col*num_rows+row];
227 }
228
229 // sum all local sums together to get
230 sdata[tid] = s;
231 if (threadIdx.x+16 < blockDim.x) sdata[tid] += sdata[tid + 16];
232 if (threadIdx.x+ 8 < blockDim.x) sdata[tid] += sdata[tid + 8];
233 if (threadIdx.x+ 4 < blockDim.x) sdata[tid] += sdata[tid + 4];
234 if (threadIdx.x+ 2 < blockDim.x) sdata[tid] += sdata[tid + 2];
235 if (threadIdx.x+ 1 < blockDim.x) sdata[tid] += sdata[tid + 1];
236
237 // store results
238 if (threadIdx.x == 0)
239 y[col] = sdata[tid];
240 }
241}
242
243
244template <typename Vector1,
245 typename Vector2,
246 typename Vector3>
247void __spmm_MVdot(const int numRHS, const Vector1& A,
248 const Vector2& x,
249 Vector3& y, cusp::row_major)
250{
251 CUSP_PROFILE_SCOPED();
252 typedef typename Vector2::index_type IndexType;
253 typedef typename Vector2::value_type ValueType;
254 typedef typename Vector2::memory_space MemorySpace;
255 const size_t BLOCK_SIZE = 32;
256
257 const size_t ROWS_PER_BLOCK = 8;
258
259 dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
260 const size_t shared = block.y * numRHS * sizeof(ValueType);
261
262
263 const size_t MAX_BLOCKS = 96;
264 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
265 dim3 grid(NUM_BLOCKS, 1);
266
267
268 cusp::array2d<ValueType, MemorySpace, cusp::row_major> temp(NUM_BLOCKS, A.num_cols, 0);
269
270 row_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
271 (ROWS_PER_BLOCK, A.num_rows, A.num_cols,
272 thrust::raw_pointer_cast(&A.values[0]),
273 thrust::raw_pointer_cast(&x.values[0]),
274 thrust::raw_pointer_cast(&temp.values[0]));
275
276 // add rows of temp (sum from each block) for final answer
277 IndexType num_rows = NUM_BLOCKS;
278 IndexType num_blocks = 16;
279 cusp::array2d<ValueType, MemorySpace, cusp::row_major> sum_temp(num_blocks, numRHS, 0);
280 //Need size to be power of 2
281 while (num_blocks > 0){
282 dim3 sum_grid(num_blocks, 1);
283 IndexType size = 1;
284 while (size < num_rows / num_blocks){
285 size <<=1;
286 }
287 dim3 sum_block( 32, size);
288 size_t sum_shared = sum_block.y * numRHS * sizeof(ValueType);
289 sum_temp.resize(num_blocks, numRHS);
290 row_reduce_kernel<IndexType,ValueType> <<<sum_grid, sum_block, sum_shared>>>
291 (temp.num_rows, temp.num_cols, thrust::raw_pointer_cast(&temp.values[0]),
292 thrust::raw_pointer_cast(&sum_temp.values[0]));
293 cusp::copy(sum_temp, temp);
294 num_rows = num_blocks;
295 num_blocks /= 2;
296 }
297 for (IndexType i = 0; i < numRHS; i++){
298 y[i] = sum_temp(0, i);
299 }
300}
301
302template <typename Vector1,
303 typename Vector2,
304 typename Vector3>
305void __spmm_MVdot(const int numRHS, const Vector1& A,
306 const Vector2& x,
307 Vector3& y, cusp::column_major)
308{
309#if 0
310 typedef typename Vector2::index_type IndexType;
311 for (IndexType col=0; col<A.num_cols; ++col) {
312 y[col] = cusp::blas::dotc(A.column(col), x.column(col));
313 }
314#else
315 CUSP_PROFILE_SCOPED();
316 typedef typename Vector2::index_type IndexType;
317 typedef typename Vector2::value_type ValueType;
318 typedef typename Vector2::memory_space MemorySpace;
319
320 const size_t BLOCK_SIZE = 32;
321 const size_t COLS_PER_BLOCK = 8;
322
323 dim3 block(BLOCK_SIZE, COLS_PER_BLOCK);
324 const size_t shared = block.x * block.y * sizeof(ValueType);
325
326 const size_t MAX_BLOCKS = 96;
327 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
328 dim3 grid(NUM_BLOCKS, 1);
329
330
331 cusp::array2d<ValueType, MemorySpace, cusp::column_major> temp(NUM_BLOCKS, A.num_cols, 0);
332
333 col_spmm_MVdot_kernel<IndexType,ValueType> <<<grid, block, shared>>>
334 (A.num_rows, A.num_cols,
335 thrust::raw_pointer_cast(&A.values[0]),
336 thrust::raw_pointer_cast(&x.values[0]),
337 thrust::raw_pointer_cast(&temp.values[0]));
338
339 // add rows of temp (sum from each block) for final answer
340 dim3 sum_grid(1, 1);
341 col_reduce_kernel<IndexType,ValueType> <<<sum_grid, block, shared>>>
342 (NUM_BLOCKS, A.num_cols,
343 thrust::raw_pointer_cast(&temp.values[0]),
344 thrust::raw_pointer_cast(&y[0]));
345#endif
346}
347
348template <typename Vector1,
349 typename Vector2,
350 typename Vector3>
351void __spmm_dense_diag(const Vector1& A,
352 const Vector2& x,
353 Vector3& y,
354 cusp::row_major)
355{
356 CUSP_PROFILE_SCOPED();
357 typedef typename Vector3::index_type IndexType;
358 typedef typename Vector3::value_type ValueType;
359 typedef typename Vector3::memory_space MemorySpace;
360 const size_t BLOCK_SIZE = 32;
361 const size_t ROWS_PER_BLOCK = 8;
362 const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
363 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, ROWS_PER_BLOCK));
364 dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
365 dim3 grid(NUM_BLOCKS, 1);
366
367 row_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
368 (A.num_rows, A.num_cols,
369 thrust::raw_pointer_cast(&A.values[0]),
370 thrust::raw_pointer_cast(&x[0]),
371 thrust::raw_pointer_cast(&(y.values)[0]));
372}
373
374template <typename Vector1,
375 typename Vector2,
376 typename Vector3>
377void __spmm_dense_diag(const Vector1& A,
378 const Vector2& x,
379 Vector3& y,
380 cusp::column_major)
381{
382#if 0
383 typedef typename Vector1::index_type IndexType;
384 typedef typename Vector1::value_type ValueType;
385 for (IndexType col=0; col<A.num_cols; ++col) {
386 cusp::blas::axpby(A.column(col), A.column(col), y.column(col),
387 x[col], ValueType(0.0));
388 }
389#else
390 CUSP_PROFILE_SCOPED();
391 typedef typename Vector3::index_type IndexType;
392 typedef typename Vector3::value_type ValueType;
393 typedef typename Vector3::memory_space MemorySpace;
394 const size_t BLOCK_SIZE = 32;
395 const size_t COLS_PER_BLOCK = 8;
396 const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_spmm_dense_diag_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
397 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(A.num_rows, BLOCK_SIZE));
398 dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
399 dim3 grid(NUM_BLOCKS, 1);
400
401 col_spmm_dense_diag_kernel<IndexType,ValueType> <<<grid, block>>>
402 (A.num_rows, A.num_cols,
403 thrust::raw_pointer_cast(&A.values[0]),
404 thrust::raw_pointer_cast(&x[0]),
405 thrust::raw_pointer_cast(&(y.values)[0]));
406#endif
407}
408
409template <typename ValueType,
410 typename Vector1,
411 typename Vector2>
412void __spmm_axpby(const ValueType& a,
413 const Vector1& x,
414 const ValueType& b,
415 const Vector1& y,
416 Vector2& z,
417 cusp::row_major)
418{
419 CUSP_PROFILE_SCOPED();
420 typedef typename Vector2::index_type IndexType;
421 typedef typename Vector2::memory_space MemorySpace;
422 const size_t BLOCK_SIZE = 32;
423 const size_t ROWS_PER_BLOCK = 6;
424 const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(row_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
425 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, ROWS_PER_BLOCK));
426 dim3 block(BLOCK_SIZE, ROWS_PER_BLOCK);
427 dim3 grid(NUM_BLOCKS, 1);
428
429 row_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
430 (a,b, x.num_rows, x.num_cols,
431 thrust::raw_pointer_cast(&x.values[0]),
432 thrust::raw_pointer_cast(&(y.values)[0]),
433 thrust::raw_pointer_cast(&(z.values)[0]));
434}
435
436template <typename ValueType,
437 typename Vector1,
438 typename Vector2>
439void __spmm_axpby(const ValueType& a,
440 const Vector1& x,
441 const ValueType& b,
442 const Vector1& y,
443 Vector2& z,
444 cusp::column_major)
445{
446#if 0
447 typedef typename Vector1::index_type IndexType;
448 for (IndexType col=0; col<x.num_cols; ++col) {
449 cusp::blas::axpby(x.column(col), y.column(col), z.column(col), a, b);
450 }
451#else
452 CUSP_PROFILE_SCOPED();
453 typedef typename Vector2::index_type IndexType;
454 typedef typename Vector2::memory_space MemorySpace;
455 const size_t BLOCK_SIZE = 32;
456 const size_t COLS_PER_BLOCK = 6;
457 const size_t MAX_BLOCKS = cusp::detail::device::arch::max_active_blocks(col_axpby_kernel<IndexType, ValueType>, BLOCK_SIZE, (size_t) 0);
458 const size_t NUM_BLOCKS = std::min(MAX_BLOCKS, DIVIDE_INTO(x.num_rows, BLOCK_SIZE));
459 dim3 block(BLOCK_SIZE, COLS_PER_BLOCK, 1);
460 dim3 grid(NUM_BLOCKS, 1);
461
462 col_axpby_kernel<IndexType,ValueType> <<<grid, block>>>
463 (a,b, x.num_rows, x.num_cols,
464 thrust::raw_pointer_cast(&x.values[0]),
465 thrust::raw_pointer_cast(&(y.values)[0]),
466 thrust::raw_pointer_cast(&(z.values)[0]));
467#endif
468}
469
470template <typename Vector1,
471 typename Vector2,
472 typename Vector3>
473void spmm_MVdot(const Vector1& A,
474 const Vector2& x,
475 Vector3& y)
476{
477 //Determine if row-wise or col-wise then call appropriate multiply
478 __spmm_MVdot(A.num_cols, A, x, y, typename Vector2::orientation());
479}
480
481template <typename Vector1,
482 typename Vector2,
483 typename Vector3>
484void spmm_dense_diag(const Vector1& A,
485 const Vector2& x,
486 Vector3& y)
487{
488 //Determine if row-wise or col-wise then call appropriate multiply
489 __spmm_dense_diag(A, x, y, typename Vector1::orientation());
490}
491
492template <typename ValueType,
493 typename Vector1,
494 typename Vector2>
495void spmm_axpby(const ValueType& a, const Vector1& x, const ValueType& b,
496 const Vector1& y,
497 Vector2& z)
498{
499 __spmm_axpby(a, x, b, y, z, typename Vector1::orientation());
500}
501
502
503} // end namespace device
504} // end namespace detail
505} // end namespace cusp
Kokkos::DefaultExecutionSpace device
__global__ void row_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:32
__global__ void row_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:62
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType * x
Definition: csr_vector.h:265
__global__ void col_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:212
__global__ void col_spmm_dense_diag_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *y)
Definition: array2d.h:77
__global__ void row_reduce_kernel(const IndexType num_rows, const IndexType num_cols, const ValueType *temp, ValueType *y)
Definition: array2d.h:173
void __spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:351
__global__ void col_spmm_MVdot_kernel(const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:135
void __spmm_MVdot(const int numRHS, const Vector1 &A, const Vector2 &x, Vector3 &y, cusp::row_major)
Definition: array2d.h:247
void spmm_MVdot(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:473
const IndexType const IndexType const IndexType const IndexType const ValueType const ValueType ValueType * y
Definition: csr_vector.h:267
void __spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z, cusp::row_major)
Definition: array2d.h:412
void spmm_dense_diag(const Vector1 &A, const Vector2 &x, Vector3 &y)
Definition: array2d.h:484
void spmm_axpby(const ValueType &a, const Vector1 &x, const ValueType &b, const Vector1 &y, Vector2 &z)
Definition: array2d.h:495
__global__ void row_spmm_MVdot_kernel(IndexType ROWS_PER_BLOCK, const IndexType Anum_rows, const IndexType Anum_cols, const ValueType *Aval, const ValueType *x, ValueType *temp)
Definition: array2d.h:92
__global__ void col_axpby_kernel(const ValueType a, const ValueType b, const IndexType num_rows, const IndexType num_cols, const ValueType *x, const ValueType *y, ValueType *z)
Definition: array2d.h:47