template <typename Indices, typename InputArgType, typename KernelArgType>
struct TensorEvaluator<const TensorConvolutionOp<Indices, InputArgType, KernelArgType>, Eigen::SyclDevice> {
  typedef TensorConvolutionOp<Indices, InputArgType, KernelArgType> XprType;
  static const int NumDims =
      internal::array_size<typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions>::value;
  static const int NumKernelDims = internal::array_size<Indices>::value;
  typedef typename XprType::Index Index;
  typedef DSizes<Index, NumDims> Dimensions;
  typedef typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions KernelDimensions;
  typedef const Eigen::SyclDevice Device;
  typedef typename XprType::CoeffReturnType CoeffReturnType;
  typedef typename PacketType<CoeffReturnType, Eigen::SyclDevice>::type PacketReturnType;
  typedef typename InputArgType::Scalar Scalar;
  static const int PacketSize = PacketType<CoeffReturnType, Device>::size;
  typedef StorageMemory<CoeffReturnType, Eigen::SyclDevice> Storage;
  typedef typename Storage::Type EvaluatorPointerType;
  typedef StorageMemory<const CoeffReturnType, Eigen::SyclDevice> KernelStorage;
  enum {
    IsAligned = TensorEvaluator<InputArgType, Eigen::SyclDevice>::IsAligned &
                TensorEvaluator<KernelArgType, Eigen::SyclDevice>::IsAligned,
    PacketAccess = false,
    BlockAccess = false,
    PreferBlockAccess = false,
    Layout = TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout,
    CoordAccess = false,
    RawAccess = false
  };

  typedef internal::TensorBlockNotImplemented TensorBlock;
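
  // This evaluator performs a "valid" convolution: for each convolved
  // dimension the output extent is input_dim - kernel_dim + 1, computed in
  // the constructor below.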
  TensorEvaluator(const XprType &op, const Eigen::SyclDevice &device)
      : m_inputImpl(op.inputExpression(), device),
        m_kernelArg(op.kernelExpression()),
        m_kernelImpl(op.kernelExpression(), device),
        m_indices(op.indices()),
        m_buf(NULL),
        m_kernel(NULL),
        m_local_kernel(false),
        m_device(device) {
    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<InputArgType, Eigen::SyclDevice>::Layout) ==
                         static_cast<int>(TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Layout)),
                        YOU_MADE_A_PROGRAMMING_MISTAKE);
    const typename TensorEvaluator<InputArgType, Eigen::SyclDevice>::Dimensions &input_dims = m_inputImpl.dimensions();
    const typename TensorEvaluator<KernelArgType, Eigen::SyclDevice>::Dimensions &kernel_dims =
        m_kernelImpl.dimensions();
    m_dimensions = m_inputImpl.dimensions();
    for (int i = 0; i < NumKernelDims; ++i) {
      const Index index = op.indices()[i];
      const Index input_dim = input_dims[index];
      const Index kernel_dim = kernel_dims[i];
      const Index result_dim = input_dim - kernel_dim + 1;
      m_dimensions[index] = result_dim;
    }
  }
  EIGEN_DEVICE_FUNC const Dimensions &dimensions() const { return m_dimensions; }
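
  // Materializes the kernel (if needed) and runs the convolution eagerly.
  // When the caller supplies an output buffer we evaluate straight into it
  // and return false; otherwise a temporary device buffer is allocated and
  // true is returned so that the caller later reads results through data().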
  EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(EvaluatorPointerType data) {
    preloadKernel();
    m_inputImpl.evalSubExprsIfNeeded(NULL);
    if (data) {
      executeEval(data);
      return false;
    } else {
      m_buf = (EvaluatorPointerType)m_device.get(
          (Scalar *)m_device.allocate_temp(dimensions().TotalSize() * sizeof(Scalar)));
      executeEval(m_buf);
      return true;
    }
  }
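
  // Releases the temporary output buffer and, if the kernel was copied into
  // device scratch memory by preloadKernel(), that copy as well.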
  EIGEN_STRONG_INLINE void cleanup() {
    m_inputImpl.cleanup();
    if (m_buf) {
      m_device.deallocate_temp(m_buf);
      m_buf = NULL;
    }
    if (m_local_kernel) {
      m_device.deallocate_temp(m_kernel);
      m_local_kernel = false;
    }
    m_kernel = NULL;
  }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device &device() const { return m_device; }
  /// used by sycl in order to build the sycl buffer
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE EvaluatorPointerType data() const { return m_buf; }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void preloadKernel() {
    // Don't make a local copy of the kernel unless we have to (i.e. it's an
    // expression that needs to be evaluated).
    typename KernelStorage::Type in_place = m_kernelImpl.data();
    if (in_place) {
      m_kernel = in_place;
      m_local_kernel = false;
    } else {
      ptrdiff_t kernel_sz = m_kernelImpl.dimensions().TotalSize() * sizeof(Scalar);
      EvaluatorPointerType local = (EvaluatorPointerType)m_device.get((Scalar *)m_device.allocate_temp(kernel_sz));
      typedef TensorEvalToOp<const KernelArgType> EvalTo;
      EvalTo evalToTmp(m_device.get(local), m_kernelArg);
      const bool PacketAccess = internal::IsVectorizable<Eigen::SyclDevice, KernelArgType>::value;
      internal::TensorExecutor<const EvalTo, Eigen::SyclDevice, PacketAccess>::run(evalToTmp, m_device);
      m_kernel = local;
      m_local_kernel = true;
    }
  }
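
  // Dispatches on the static number of convolved dimensions (1, 2 or 3) and
  // launches the matching SYCL kernel. Each branch tiles the output, sizes
  // the work-group local memory so that a tile plus the kernel halo fits,
  // and asserts that this stays within the device's shared-memory limit.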
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void executeEval(EvaluatorPointerType data) const {
    typedef TensorEvaluator<InputArgType, Eigen::SyclDevice> InputEvaluator;
    typedef typename InputEvaluator::Dimensions InputDims;
    switch (NumKernelDims) {
      case 1: {
        const size_t numX = dimensions()[m_indices[0]];
        const size_t numP = dimensions().TotalSize() / numX;
        const auto input_dim = std::array<size_t, 2>{numX, numP};
        auto global_range = cl::sycl::range<2>{};
        auto local_range = cl::sycl::range<2>{};
        const size_t kernel_size = m_kernelImpl.dimensions().TotalSize();

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        const size_t local_memory_size = (local_range[0] + kernel_size - 1) * (local_range[1]);
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 1> indices{{m_indices[0]}};
        const array<Index, 1> kernel_dims{{m_kernelImpl.dimensions()[0]}};
        internal::IndexMapper<Index, InputDims, 1, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV1D>
            ConvKernel;

        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<2>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<2>(input_dim[0], input_dim[1]));
        break;
      }
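
      // For 2D (and 3D) the fastest-varying convolved dimension depends on
      // the tensor layout, so the kernel dimensions are permuted accordingly
      // before the launch is set up.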
      case 2: {
        auto kernel_index = std::array<size_t, 2>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 0};
        auto kernel_size = cl::sycl::range<2>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numP = dimensions().TotalSize() / (numX * numY);
        auto input_dim = std::array<size_t, 3>{numX, numY, numP};

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);

        const size_t local_memory_size =
            (local_range[0] + kernel_size[0] - 1) * (local_range[1] + kernel_size[1] - 1) * local_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        const array<Index, 2> indices{{m_indices[kernel_index[0]], m_indices[kernel_index[1]]}};
        const array<Index, 2> kernel_dims{
            {m_kernelImpl.dimensions()[kernel_index[0]], m_kernelImpl.dimensions()[kernel_index[1]]}};
        internal::IndexMapper<Index, InputDims, 2, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV2D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>{input_dim[0], input_dim[1], input_dim[2]});
        break;
      }
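
      // The 3D case additionally flattens all non-convolved dimensions into
      // a single batch count (numP) that is passed to the kernel launcher.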
      case 3: {
        auto kernel_index = std::array<size_t, 3>{static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 0 : 2,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 1 : 1,
                                                  static_cast<int>(Layout) == static_cast<int>(ColMajor) ? 2 : 0};

        auto kernel_size = cl::sycl::range<3>{(size_t)m_kernelImpl.dimensions()[kernel_index[0]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[1]],
                                              (size_t)m_kernelImpl.dimensions()[kernel_index[2]]};
        const size_t numX = dimensions()[m_indices[kernel_index[0]]];
        const size_t numY = dimensions()[m_indices[kernel_index[1]]];
        const size_t numZ = dimensions()[m_indices[kernel_index[2]]];
        auto input_dim = std::array<size_t, 3>{numX, numY, numZ};
        const size_t numP = dimensions().TotalSize() / (numX * numY * numZ);

        const array<Index, 3> indices{
            {m_indices[kernel_index[0]], m_indices[kernel_index[1]], m_indices[kernel_index[2]]}};
        const array<Index, 3> kernel_dims{{m_kernelImpl.dimensions()[kernel_index[0]],
                                           m_kernelImpl.dimensions()[kernel_index[1]],
                                           m_kernelImpl.dimensions()[kernel_index[2]]}};

        internal::IndexMapper<Index, InputDims, 3, Layout> indexMapper(m_inputImpl.dimensions(), kernel_dims, indices);

        auto global_range = cl::sycl::range<3>{};
        auto local_range = cl::sycl::range<3>{};

        m_device.parallel_for_setup(input_dim, global_range, local_range);
        auto local_memory_range = (local_range + kernel_size - 1);
        const size_t local_memory_size = local_memory_range[0] * local_memory_range[1] * local_memory_range[2];
        gpu_assert(static_cast<unsigned long>(local_memory_size) <= m_device.sharedMemPerBlock());
        typedef EigenConvolutionKernel<InputEvaluator, CoeffReturnType, Scalar, Index, InputDims,
                                       typename KernelStorage::Type, EvaluatorPointerType, convolution_type::CONV3D>
            ConvKernel;
        m_device.template binary_kernel_launcher<CoeffReturnType, ConvKernel>(
            m_inputImpl, m_kernel, data, cl::sycl::nd_range<3>(global_range, local_range), local_memory_size,
            indexMapper, kernel_size, cl::sycl::range<3>(input_dim[0], input_dim[1], input_dim[2]), numP);
        break;
      }

      default: {
        EIGEN_STATIC_ASSERT((NumKernelDims >= 1 && NumKernelDims <= 3),
                            THIS_METHOD_IS_ONLY_FOR_OBJECTS_OF_A_SPECIFIC_SIZE);
      }
    }
  }
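
  // executeEval() has materialized the result into m_buf, so coefficient and
  // packet access reduce to plain reads from the temporary buffer.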
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return m_buf[index];
  }
  template <int LoadMode>
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(const Index index) const {
    eigen_assert(m_buf != NULL);
    eigen_assert(index < m_dimensions.TotalSize());
    return internal::ploadt<PacketReturnType, LoadMode>(m_buf + index);
  }
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const {
    // For now, this is just a copy of the CPU cost model.
    const double kernel_size = m_kernelImpl.dimensions().TotalSize();
    // We ignore the use of fused multiply-add.
    const double convolve_compute_cost = TensorOpCost::AddCost<Scalar>() + TensorOpCost::MulCost<Scalar>();
    const double firstIndex_compute_cost =
        NumDims *
        (2 * TensorOpCost::AddCost<Index>() + 2 * TensorOpCost::MulCost<Index>() + TensorOpCost::DivCost<Index>());
    return TensorOpCost(0, 0, firstIndex_compute_cost, vectorized, PacketSize) +
           kernel_size * (m_inputImpl.costPerCoeff(vectorized) + m_kernelImpl.costPerCoeff(vectorized) +
                          TensorOpCost(0, 0, convolve_compute_cost, vectorized, PacketSize));
  }
  // binding placeholder accessors to a command group handler for SYCL
  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void bind(cl::sycl::handler &cgh) const {
    m_kernelImpl.bind(cgh);
    m_inputImpl.bind(cgh);
    m_buf.bind(cgh);
    m_kernel.bind(cgh);
  }
 private:
  // No assignment (copies are needed by the kernels).
  TensorEvaluator &operator=(const TensorEvaluator &);
  TensorEvaluator<InputArgType, Eigen::SyclDevice> m_inputImpl;
  KernelArgType m_kernelArg;
  TensorEvaluator<KernelArgType, Eigen::SyclDevice> m_kernelImpl;
  Indices m_indices;
  Dimensions m_dimensions;
  EvaluatorPointerType m_buf;
  typename KernelStorage::Type m_kernel;
  bool m_local_kernel;
  const Eigen::SyclDevice EIGEN_DEVICE_REF m_device;
};
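
// A minimal usage sketch of how this evaluator is reached (illustrative
// only; assumes an already-constructed Eigen::SyclDevice named sycl_device
// and device-resident buffers d_input, d_kernel, d_result wrapped in
// TensorMaps):
//
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_input(d_input, 32, 32, 8);
//   Eigen::TensorMap<Eigen::Tensor<float, 2>> gpu_kernel(d_kernel, 3, 3);
//   // Valid convolution over dims 0 and 1: output extents are 30 = 32 - 3 + 1.
//   Eigen::TensorMap<Eigen::Tensor<float, 3>> gpu_result(d_result, 30, 30, 8);
//   Eigen::array<Eigen::Index, 2> conv_dims{0, 1};
//   gpu_result.device(sycl_device) = gpu_input.convolve(gpu_kernel, conv_dims);
//
// The assignment dispatches to this evaluator with NumKernelDims == 2,
// i.e. the CONV2D branch of executeEval().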