17#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18#include <Kokkos_Macros.hpp>
20 "Including non-public Kokkos header files is not allowed.");
22#ifndef KOKKOS_CUDASPACE_HPP
23#define KOKKOS_CUDASPACE_HPP
25#include <Kokkos_Macros.hpp>
26#if defined(KOKKOS_ENABLE_CUDA)
28#include <Kokkos_Core_fwd.hpp>
35#include <Kokkos_HostSpace.hpp>
36#include <impl/Kokkos_SharedAlloc.hpp>
38#include <impl/Kokkos_Profiling_Interface.hpp>
40#include <Cuda/Kokkos_Cuda_abort.hpp>
42#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
// C-linkage declarations that sit behind the
// KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST guard; the definitions are not
// visible here. Presumably a getter/setter pair for the "pin UVM to host"
// debug flag -- confirm against the implementation file.
43extern "C" bool kokkos_impl_cuda_pin_uvm_to_host();
// Takes the bool value to store; returns nothing.
44extern "C" void kokkos_impl_cuda_set_pin_uvm_to_host(
bool);
// Trait whose default derives from std::false_type (the template header line
// is not visible in this listing). The specializations for CudaSpace,
// CudaUVMSpace and CudaHostPinnedSpace elsewhere in this file derive from
// std::true_type instead.
53struct is_cuda_type_space :
public std::false_type {};
// Member type aliases of CudaSpace (the class header is not visible in this
// listing). The space names itself as memory_space and Kokkos::Cuda as its
// execution_space.
62 using memory_space = CudaSpace;
63 using execution_space = Kokkos::Cuda;
// Device type pairing the two aliases above.
64 using device_type = Kokkos::Device<execution_space, memory_space>;
// size_type for this space is unsigned int.
66 using size_type =
unsigned int;
// Move/copy construction, move/copy assignment and destruction are all
// explicitly defaulted. (A default constructor, if any, is not visible in
// this listing.)
71 CudaSpace(CudaSpace&& rhs) =
default;
72 CudaSpace(
const CudaSpace& rhs) =
default;
73 CudaSpace& operator=(CudaSpace&& rhs) =
default;
74 CudaSpace& operator=(
const CudaSpace& rhs) =
default;
75 ~CudaSpace() =
default;
// Public allocation interface of CudaSpace (declarations only; definitions
// are not visible here). All overloads are const member functions.
//
// allocate: returns a void* for a request of arg_alloc_size. Overloads
// optionally take a Cuda execution space instance and/or a label;
// arg_logical_size defaults to 0 in the labelled overloads.
// NOTE(review): presumably sizes are in bytes and arg_logical_size is the
// size reported to tooling -- confirm against the implementation.
78 void* allocate(
const Cuda& exec_space,
const size_t arg_alloc_size)
const;
79 void* allocate(
const Cuda& exec_space,
const char* arg_label,
80 const size_t arg_alloc_size,
81 const size_t arg_logical_size = 0)
const;
82 void* allocate(
const size_t arg_alloc_size)
const;
83 void* allocate(
const char* arg_label,
const size_t arg_alloc_size,
84 const size_t arg_logical_size = 0)
const;
// deallocate: takes the pointer and the allocation size; the labelled
// overload also accepts arg_logical_size (default 0). Returns nothing.
87 void deallocate(
void*
const arg_alloc_ptr,
const size_t arg_alloc_size)
const;
88 void deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
89 const size_t arg_alloc_size,
90 const size_t arg_logical_size = 0)
const;
// NOTE(review): the declaration that this four-parameter template header
// introduces is missing from this listing (the embedded numbering jumps from
// 93 to 95). It should not be read as the template header of impl_allocate.
93 template <
class,
class,
class,
class>
// impl_allocate / impl_deallocate: same parameters as the public labelled
// overloads plus a trailing unnamed Kokkos::Tools::SpaceHandle that defaults
// to make_space_handle(name()), i.e. a handle built from this space's name.
95 void* impl_allocate(
const Cuda& exec_space,
const char* arg_label,
96 const size_t arg_alloc_size,
97 const size_t arg_logical_size = 0,
98 const Kokkos::Tools::SpaceHandle =
99 Kokkos::Tools::make_space_handle(name()))
const;
// Overload without an execution space instance.
100 void* impl_allocate(
const char* arg_label,
const size_t arg_alloc_size,
101 const size_t arg_logical_size = 0,
102 const Kokkos::Tools::SpaceHandle =
103 Kokkos::Tools::make_space_handle(name()))
const;
// Deallocation counterpart; takes the pointer to release and its sizes.
104 void impl_deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
105 const size_t arg_alloc_size,
106 const size_t arg_logical_size = 0,
107 const Kokkos::Tools::SpaceHandle =
108 Kokkos::Tools::make_space_handle(name()))
const;
// Returns the space's name string (m_name); usable in constant expressions.
112 static constexpr const char* name() {
return m_name; }
// Name of this memory space: "Cuda".
117 static constexpr const char* m_name =
"Cuda";
// The allocation record specialization for CudaSpace is granted friend access.
118 friend class Kokkos::Impl::SharedAllocationRecord<Kokkos::CudaSpace, void>;
// Marks CudaSpace as a CUDA memory space: this specialization derives from
// std::true_type. (The `template <>` line is not visible in this listing.)
122struct Impl::is_cuda_type_space<CudaSpace> :
public std::true_type {};
// Member type aliases of CudaUVMSpace (the class header is not visible in
// this listing). The space names itself as memory_space and Cuda as its
// execution_space.
137 using memory_space = CudaUVMSpace;
138 using execution_space = Cuda;
// Device type pairing the two aliases above.
139 using device_type = Kokkos::Device<execution_space, memory_space>;
// size_type for this space is unsigned int.
140 using size_type =
unsigned int;
// Declared only when KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined and tagged
// KOKKOS_DEPRECATED. Static query returning bool; the definition is not
// visible here, so what exactly it reports must be checked there.
142#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
144 KOKKOS_DEPRECATED
static bool available();
// Move/copy construction, move/copy assignment and destruction are all
// explicitly defaulted. (A default constructor, if any, is not visible in
// this listing.)
152 CudaUVMSpace(CudaUVMSpace&& rhs) =
default;
153 CudaUVMSpace(
const CudaUVMSpace& rhs) =
default;
154 CudaUVMSpace& operator=(CudaUVMSpace&& rhs) =
default;
155 CudaUVMSpace& operator=(
const CudaUVMSpace& rhs) =
default;
156 ~CudaUVMSpace() =
default;
// Public allocation interface of CudaUVMSpace (declarations only). Unlike
// the CudaSpace overloads in this file, none of these takes an execution
// space instance.
//
// allocate: returns a void* for a request of arg_alloc_size; the labelled
// overload adds arg_label and arg_logical_size (default 0).
159 void* allocate(
const size_t arg_alloc_size)
const;
160 void* allocate(
const char* arg_label,
const size_t arg_alloc_size,
161 const size_t arg_logical_size = 0)
const;
// deallocate: takes the pointer and the allocation size; the labelled
// overload also accepts arg_logical_size (default 0). Returns nothing.
164 void deallocate(
void*
const arg_alloc_ptr,
const size_t arg_alloc_size)
const;
165 void deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
166 const size_t arg_alloc_size,
167 const size_t arg_logical_size = 0)
const;
// NOTE(review): the declaration that this four-parameter template header
// introduces is missing from this listing (the embedded numbering jumps from
// 170 to 172). It should not be read as the template header of impl_allocate.
170 template <
class,
class,
class,
class>
// impl_allocate / impl_deallocate: labelled allocation entry points with a
// trailing unnamed Kokkos::Tools::SpaceHandle that defaults to
// make_space_handle(name()), i.e. a handle built from this space's name.
172 void* impl_allocate(
const char* arg_label,
const size_t arg_alloc_size,
173 const size_t arg_logical_size = 0,
174 const Kokkos::Tools::SpaceHandle =
175 Kokkos::Tools::make_space_handle(name()))
const;
// Deallocation counterpart; takes the pointer to release and its sizes.
176 void impl_deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
177 const size_t arg_alloc_size,
178 const size_t arg_logical_size = 0,
179 const Kokkos::Tools::SpaceHandle =
180 Kokkos::Tools::make_space_handle(name()))
const;
// Returns the space's name string (m_name); usable in constant expressions.
184 static constexpr const char* name() {
return m_name; }
// Static accessors that exist only when
// KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST is defined. Presumably they read
// and write the static flag declared below -- definitions not visible here.
186#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
187 static bool cuda_pin_uvm_to_host();
188 static void cuda_set_pin_uvm_to_host(
bool val);
// Static bool data member, likewise present only under the debug guard.
195#ifdef KOKKOS_IMPL_DEBUG_CUDA_PIN_UVM_TO_HOST
196 static bool kokkos_impl_cuda_pin_uvm_to_host_v;
// Name of this memory space: "CudaUVM".
198 static constexpr const char* m_name =
"CudaUVM";
// Marks CudaUVMSpace as a CUDA memory space: this specialization derives
// from std::true_type. (The `template <>` line is not visible here.)
202struct Impl::is_cuda_type_space<CudaUVMSpace> :
public std::true_type {};
// Memory space class whose execution space is taken from HostSpace rather
// than being Cuda, unlike the CudaSpace and CudaUVMSpace aliases in this file.
214class CudaHostPinnedSpace {
// Execution space is whatever HostSpace uses.
218 using execution_space = HostSpace::execution_space;
219 using memory_space = CudaHostPinnedSpace;
// Device type pairing the two aliases above.
220 using device_type = Kokkos::Device<execution_space, memory_space>;
// size_type for this space is unsigned int.
221 using size_type =
unsigned int;
// The default constructor is user-declared (defined elsewhere); the
// remaining copy/move operations and the destructor are defaulted.
225 CudaHostPinnedSpace();
226 CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) =
default;
227 CudaHostPinnedSpace(
const CudaHostPinnedSpace& rhs) =
default;
228 CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) =
default;
229 CudaHostPinnedSpace& operator=(
const CudaHostPinnedSpace& rhs) =
default;
230 ~CudaHostPinnedSpace() =
default;
// Public allocation interface of CudaHostPinnedSpace (declarations only).
// All overloads are const member functions.
//
// allocate: returns a void* for a request of arg_alloc_size; the labelled
// overload adds arg_label and arg_logical_size (default 0).
233 void* allocate(
const size_t arg_alloc_size)
const;
234 void* allocate(
const char* arg_label,
const size_t arg_alloc_size,
235 const size_t arg_logical_size = 0)
const;
// deallocate: takes the pointer and the allocation size; the labelled
// overload also accepts arg_logical_size (default 0). Returns nothing.
238 void deallocate(
void*
const arg_alloc_ptr,
const size_t arg_alloc_size)
const;
239 void deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
240 const size_t arg_alloc_size,
241 const size_t arg_logical_size = 0)
const;
// NOTE(review): the declaration that this four-parameter template header
// introduces is missing from this listing (the embedded numbering jumps from
// 244 to 246). It should not be read as the template header of impl_allocate.
244 template <
class,
class,
class,
class>
// impl_allocate / impl_deallocate: labelled allocation entry points with a
// trailing unnamed Kokkos::Tools::SpaceHandle that defaults to
// make_space_handle(name()), i.e. a handle built from this space's name.
246 void* impl_allocate(
const char* arg_label,
const size_t arg_alloc_size,
247 const size_t arg_logical_size = 0,
248 const Kokkos::Tools::SpaceHandle =
249 Kokkos::Tools::make_space_handle(name()))
const;
// Deallocation counterpart; takes the pointer to release and its sizes.
250 void impl_deallocate(
const char* arg_label,
void*
const arg_alloc_ptr,
251 const size_t arg_alloc_size,
252 const size_t arg_logical_size = 0,
253 const Kokkos::Tools::SpaceHandle =
254 Kokkos::Tools::make_space_handle(name()))
const;
// Returns the space's name string (m_name); usable in constant expressions.
258 static constexpr const char* name() {
return m_name; }
// Name of this memory space: "CudaHostPinned".
261 static constexpr const char* m_name =
"CudaHostPinned";
// Marks CudaHostPinnedSpace as a CUDA memory space: this specialization
// derives from std::true_type. (The `template <>` line is not visible here.)
267struct Impl::is_cuda_type_space<CudaHostPinnedSpace> :
public std::true_type {};
// Returns a cudaStream_t; by its name, the stream used for deep copies
// (definition not visible here -- confirm).
277cudaStream_t cuda_get_deep_copy_stream();
// Returns a const reference to a std::unique_ptr<Kokkos::Cuda>. The
// `initialize` flag defaults to true; presumably it controls whether the
// instance is created on first use -- confirm against the definition.
279const std::unique_ptr<Kokkos::Cuda>& cuda_get_deep_copy_space(
280 bool initialize =
true);
283 Kokkos::CudaSpace>::assignable,
286 Kokkos::CudaUVMSpace>::assignable,
290 Kokkos::CudaHostPinnedSpace>::assignable,
// Access flags for the (HostSpace, CudaSpace) pair: not assignable, not
// accessible, deep copy supported. The `template <>` line and closing brace
// are not visible in this listing.
296struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
297 enum :
bool { assignable =
false };
298 enum :
bool { accessible =
false };
299 enum :
bool { deepcopy =
true };
// Access flags for the (HostSpace, CudaUVMSpace) pair: not assignable, but
// accessible and deep copy supported.
303struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaUVMSpace> {
305 enum :
bool { assignable =
false };
306 enum :
bool { accessible =
true };
307 enum :
bool { deepcopy =
true };
// Access flags for the (HostSpace, CudaHostPinnedSpace) pair: assignable,
// accessible and deep copy supported.
311struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaHostPinnedSpace> {
313 enum :
bool { assignable =
true };
314 enum :
bool { accessible =
true };
315 enum :
bool { deepcopy =
true };
// NOTE(review): the struct header for this flag set is missing from the
// listing. Given its position it is presumably a MemorySpaceAccess
// specialization with Kokkos::CudaSpace as first argument -- confirm.
// Flags: not assignable, not accessible, deep copy supported.
322 enum :
bool { assignable =
false };
323 enum :
bool { accessible =
false };
324 enum :
bool { deepcopy =
true };
// Access flags for the (CudaSpace, CudaUVMSpace) pair: assignable,
// accessible and deep copy supported.
328struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaUVMSpace> {
330 enum :
bool { assignable =
true };
331 enum :
bool { accessible =
true };
332 enum :
bool { deepcopy =
true };
// Access flags for the (CudaSpace, CudaHostPinnedSpace) pair: not
// assignable, but accessible and deep copy supported.
336struct MemorySpaceAccess<Kokkos::CudaSpace, Kokkos::CudaHostPinnedSpace> {
338 enum :
bool { assignable =
false };
339 enum :
bool { accessible =
true };
340 enum :
bool { deepcopy =
true };
// NOTE(review): the struct header for this flag set is missing from the
// listing. Given its position it is presumably a MemorySpaceAccess
// specialization with Kokkos::CudaUVMSpace as first argument -- confirm.
// Flags: not assignable, not accessible, deep copy supported.
349 enum :
bool { assignable =
false };
350 enum :
bool { accessible =
false };
351 enum :
bool { deepcopy =
true };
// Access flags for the (CudaUVMSpace, CudaSpace) pair: not assignable, but
// accessible and deep copy supported.
355struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaSpace> {
358 enum :
bool { assignable =
false };
361 enum :
bool { accessible =
true };
362 enum :
bool { deepcopy =
true };
// Access flags for the (CudaUVMSpace, CudaHostPinnedSpace) pair: not
// assignable, but accessible and deep copy supported.
366struct MemorySpaceAccess<Kokkos::CudaUVMSpace, Kokkos::CudaHostPinnedSpace> {
368 enum :
bool { assignable =
false };
369 enum :
bool { accessible =
true };
370 enum :
bool { deepcopy =
true };
// NOTE(review): the struct header for this flag set is missing from the
// listing. Given its position it is presumably a MemorySpaceAccess
// specialization with Kokkos::CudaHostPinnedSpace as first argument --
// confirm. Flags: not assignable, accessible, deep copy supported.
379 enum :
bool { assignable =
false };
380 enum :
bool { accessible =
true };
381 enum :
bool { deepcopy =
true };
// Access flags for the (CudaHostPinnedSpace, CudaSpace) pair: not
// assignable, not accessible, deep copy supported.
385struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaSpace> {
386 enum :
bool { assignable =
false };
387 enum :
bool { accessible =
false };
388 enum :
bool { deepcopy =
true };
// Access flags for the (CudaHostPinnedSpace, CudaUVMSpace) pair: not
// assignable, but accessible and deep copy supported.
392struct MemorySpaceAccess<Kokkos::CudaHostPinnedSpace, Kokkos::CudaUVMSpace> {
393 enum :
bool { assignable =
false };
394 enum :
bool { accessible =
true };
395 enum :
bool { deepcopy =
true };
// Free-function copy primitives called by the DeepCopy specializations in
// this file (declarations only). Each takes a destination pointer, a source
// pointer and a count n. NOTE(review): n is presumably a byte count --
// confirm against the definitions.
409void DeepCopyCuda(
void* dst,
const void* src,
size_t n);
// Overload that additionally takes the Cuda instance to use.
// NOTE(review): the final parameter line of this declaration is missing
// from the listing (the embedded numbering jumps from 410 to 412).
410void DeepCopyAsyncCuda(
const Cuda& instance,
void* dst,
const void* src,
// Overload without an explicit Cuda instance.
412void DeepCopyAsyncCuda(
void* dst,
const void* src,
size_t n);
// DeepCopy specialization for (MemSpace, HostSpace) with the Cuda execution
// space, enabled only when MemSpace satisfies is_cuda_type_space.
// Closing braces are not visible in this listing.
414template <
class MemSpace>
415struct DeepCopy<MemSpace, HostSpace, Cuda,
416 std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
// Without an instance: forwards to DeepCopyCuda.
417 DeepCopy(
void* dst,
const void* src,
size_t n) { DeepCopyCuda(dst, src, n); }
// With a Cuda instance: forwards to DeepCopyAsyncCuda on that instance.
418 DeepCopy(
const Cuda& instance,
void* dst,
const void* src,
size_t n) {
419 DeepCopyAsyncCuda(instance, dst, src, n);
// DeepCopy specialization for (HostSpace, MemSpace) with the Cuda execution
// space, enabled only when MemSpace satisfies is_cuda_type_space.
// Closing braces are not visible in this listing.
423template <
class MemSpace>
424struct DeepCopy<HostSpace, MemSpace, Cuda,
425 std::enable_if_t<is_cuda_type_space<MemSpace>::value>> {
// Without an instance: forwards to DeepCopyCuda.
426 DeepCopy(
void* dst,
const void* src,
size_t n) { DeepCopyCuda(dst, src, n); }
// With a Cuda instance: forwards to DeepCopyAsyncCuda on that instance.
427 DeepCopy(
const Cuda& instance,
void* dst,
const void* src,
size_t n) {
428 DeepCopyAsyncCuda(instance, dst, src, n);
// DeepCopy specialization with the Cuda execution space, enabled only when
// both memory spaces satisfy is_cuda_type_space.
// Closing braces are not visible in this listing.
432template <
class MemSpace1,
class MemSpace2>
433struct DeepCopy<MemSpace1, MemSpace2, Cuda,
434 std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
435 is_cuda_type_space<MemSpace2>::value>> {
// Without an instance: forwards to DeepCopyCuda.
436 DeepCopy(
void* dst,
const void* src,
size_t n) { DeepCopyCuda(dst, src, n); }
// With a Cuda instance: forwards to DeepCopyAsyncCuda on that instance.
437 DeepCopy(
const Cuda& instance,
void* dst,
const void* src,
size_t n) {
438 DeepCopyAsyncCuda(instance, dst, src, n);
// DeepCopy specialization for an execution space other than Cuda, enabled
// only when both memory spaces satisfy is_cuda_type_space. Several lines
// (closing braces, part of a parameter list, part of the fence string) are
// missing from this listing.
442template <
class MemSpace1,
class MemSpace2,
class ExecutionSpace>
443struct DeepCopy<MemSpace1, MemSpace2, ExecutionSpace,
444 std::enable_if_t<is_cuda_type_space<MemSpace1>::value &&
445 is_cuda_type_space<MemSpace2>::value &&
446 !std::is_same<ExecutionSpace, Cuda>::value>> {
// Without an instance: forwards to DeepCopyCuda.
447 inline DeepCopy(
void* dst,
const void* src,
size_t n) {
448 DeepCopyCuda(dst, src, n);
// With an instance: fences the given execution space first (labelled with
// fence_string()), then issues the copy via the instance-less
// DeepCopyAsyncCuda overload.
451 inline DeepCopy(
const ExecutionSpace& exec,
void* dst,
const void* src,
453 exec.fence(fence_string());
454 DeepCopyAsyncCuda(dst, src, n);
// Builds the fence label once (function-local static) from the memory space
// name(s). NOTE(review): one line of the concatenation is not visible here.
458 static const std::string& fence_string() {
459 static const std::string
string =
460 std::string(
"Kokkos::Impl::DeepCopy<") + MemSpace1::name() +
"Space, " +
462 "Space, ExecutionSpace>::DeepCopy: fence before copy";
// DeepCopy specialization for (MemSpace, HostSpace) with an execution space
// other than Cuda, enabled only when MemSpace satisfies is_cuda_type_space.
// Closing braces and part of a parameter list are missing from this listing.
467template <
class MemSpace,
class ExecutionSpace>
468struct DeepCopy<MemSpace, HostSpace, ExecutionSpace,
469 std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
470 !std::is_same<ExecutionSpace, Cuda>::value>> {
// Without an instance: forwards to DeepCopyCuda.
471 inline DeepCopy(
void* dst,
const void* src,
size_t n) {
472 DeepCopyCuda(dst, src, n);
// With an instance: fences the given execution space first (labelled with
// fence_string()), then issues the copy via the instance-less
// DeepCopyAsyncCuda overload.
475 inline DeepCopy(
const ExecutionSpace& exec,
void* dst,
const void* src,
477 exec.fence(fence_string());
478 DeepCopyAsyncCuda(dst, src, n);
// Builds the fence label once (function-local static) from MemSpace::name().
482 static const std::string& fence_string() {
483 static const std::string
string =
484 std::string(
"Kokkos::Impl::DeepCopy<") + MemSpace::name() +
485 "Space, HostSpace, ExecutionSpace>::DeepCopy: fence before copy";
// DeepCopy specialization for (HostSpace, MemSpace) with an execution space
// other than Cuda, enabled only when MemSpace satisfies is_cuda_type_space.
// Closing braces and part of a parameter list are missing from this listing.
490template <
class MemSpace,
class ExecutionSpace>
491struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
492 std::enable_if_t<is_cuda_type_space<MemSpace>::value &&
493 !std::is_same<ExecutionSpace, Cuda>::value>> {
// Without an instance: forwards to DeepCopyCuda.
494 inline DeepCopy(
void* dst,
const void* src,
size_t n) {
495 DeepCopyCuda(dst, src, n);
// With an instance: fences the given execution space first (labelled with
// fence_string()), then issues the copy via the instance-less
// DeepCopyAsyncCuda overload.
498 inline DeepCopy(
const ExecutionSpace& exec,
void* dst,
const void* src,
500 exec.fence(fence_string());
501 DeepCopyAsyncCuda(dst, src, n);
// Builds the fence label once (function-local static) from MemSpace::name().
505 static const std::string& fence_string() {
506 static const std::string
string =
507 std::string(
"Kokkos::Impl::DeepCopy<HostSpace, ") + MemSpace::name() +
508 "Space, ExecutionSpace>::DeepCopy: fence before copy";
// Allocation record specialization for Kokkos::CudaSpace. It derives from
// HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>.
// Access specifiers, `template <>` and several other lines are missing from
// this listing.
523class SharedAllocationRecord<Kokkos::CudaSpace, void>
524 :
public HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace> {
// The CudaUVMSpace record is a friend; it calls this class's static
// attach_texture_object (see its own attach_texture_object<AliasType>).
526 friend class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>;
527 friend class SharedAllocationRecordCommon<Kokkos::CudaSpace>;
528 friend class HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
530 using RecordBase = SharedAllocationRecord<void, void>;
// NOTE(review): the line below is the tail of an alias declaration whose
// first line is not visible (presumably `using base_t =`, since base_t is
// used further down).
532 HostInaccessibleSharedAllocationRecordCommon<Kokkos::CudaSpace>;
// Records are non-copyable.
534 SharedAllocationRecord(const SharedAllocationRecord&) = delete;
535 SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
// Static helper returning a texture object for an allocation, given the
// alias element size, the allocation pointer and its size (declaration only).
537 static ::cudaTextureObject_t attach_texture_object(
538 const unsigned sizeof_alias, void* const alloc_ptr,
539 const size_t alloc_size);
// Root record, declared under KOKKOS_ENABLE_DEBUG.
541#ifdef KOKKOS_ENABLE_DEBUG
542 static RecordBase s_root_record;
// Texture object handle; 0 means "not yet attached" (see the lazy check in
// attach_texture_object<AliasType>).
545 ::cudaTextureObject_t m_tex_obj = 0;
// The memory space instance stored with this record.
546 const Kokkos::CudaSpace m_space;
549 ~SharedAllocationRecord();
550 SharedAllocationRecord() = default;
// Constructor template taking an arbitrary execution space (the parameter is
// unnamed, so it is not used by name in the body). Most of the initializer
// list is missing from this listing; the visible parts show the base being
// given the debug-only root record, the result of
// checked_allocation_with_header(), and a total size of
// sizeof(SharedAllocationHeader) + arg_alloc_size.
556 template <typename ExecutionSpace>
557 SharedAllocationRecord(
558 const ExecutionSpace& , const Kokkos::CudaSpace& arg_space,
559 const std::string& arg_label, const size_t arg_alloc_size,
560 const RecordBase::function_type arg_dealloc = &base_t::deallocate)
562#ifdef KOKKOS_ENABLE_DEBUG
563 &SharedAllocationRecord<Kokkos::CudaSpace, void>::s_root_record,
565 Impl::checked_allocation_with_header(arg_space, arg_label,
567 sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
// The header is filled in a local object first and then copied to the
// allocation with deep_copy_header_no_exec, rather than being written
// through m_alloc_ptr directly as the UVM and host-pinned records do.
572 SharedAllocationHeader header;
574 this->base_t::_fill_host_accessible_header_info(header, arg_label);
579 deep_copy_header_no_exec(RecordBase::m_alloc_ptr, &header);
// Non-template constructor taking a Cuda execution space instance
// (declaration only). arg_dealloc defaults to base_t::deallocate.
582 SharedAllocationRecord(
583 const Kokkos::Cuda& exec_space,
const Kokkos::CudaSpace& arg_space,
584 const std::string& arg_label,
const size_t arg_alloc_size,
585 const RecordBase::function_type arg_dealloc = &base_t::deallocate);
// Constructor without an execution space instance (declaration only).
587 SharedAllocationRecord(
588 const Kokkos::CudaSpace& arg_space,
const std::string& arg_label,
589 const size_t arg_alloc_size,
590 const RecordBase::function_type arg_dealloc = &base_t::deallocate);
// Header copy helper used by the constructor template in this class, which
// calls it as (RecordBase::m_alloc_ptr, &header), i.e. destination first.
594 static void deep_copy_header_no_exec(
void*,
const void*);
// Returns the record's texture object for alias type AliasType, which the
// static_assert restricts to int, ::int2 or ::int4. The object is created on
// first use: only when m_tex_obj is still 0 is the static
// attach_texture_object called with sizeof(AliasType), the allocation
// pointer and the allocation size.
// NOTE(review): the return statement and closing braces are not visible in
// this listing.
597 template <
typename AliasType>
598 inline ::cudaTextureObject_t attach_texture_object() {
599 static_assert((std::is_same<AliasType, int>::value ||
600 std::is_same<AliasType, ::int2>::value ||
601 std::is_same<AliasType, ::int4>::value),
602 "Cuda texture fetch only supported for alias types of int, "
603 "::int2, or ::int4");
605 if (m_tex_obj == 0) {
606 m_tex_obj = attach_texture_object(
sizeof(AliasType),
607 (
void*)RecordBase::m_alloc_ptr,
608 RecordBase::m_alloc_size);
// Returns the distance, in AliasType elements, from the start of the record's
// allocation (m_alloc_ptr reinterpreted as AliasType*) to ptr. The pointer
// difference is converted to the int return type.
614 template <
typename AliasType>
615 inline int attach_texture_object_offset(
const AliasType*
const ptr) {
617 return ptr -
reinterpret_cast<AliasType*
>(RecordBase::m_alloc_ptr);
// Allocation record specialization for Kokkos::CudaUVMSpace. It derives from
// SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>. Access specifiers,
// `template <>` and several other lines are missing from this listing.
622class SharedAllocationRecord<Kokkos::CudaUVMSpace, void>
623 :
public SharedAllocationRecordCommon<Kokkos::CudaUVMSpace> {
625 friend class SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
627 using base_t = SharedAllocationRecordCommon<Kokkos::CudaUVMSpace>;
628 using RecordBase = SharedAllocationRecord<void, void>;
// Records are non-copyable.
630 SharedAllocationRecord(const SharedAllocationRecord&) = delete;
631 SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
// Root record. NOTE(review): no preprocessor guard is visible around this
// declaration here, unlike in the CudaSpace record -- it may have been
// dropped from the listing.
633 static RecordBase s_root_record;
// Texture object handle; 0 means "not yet attached" (see the lazy check in
// attach_texture_object<AliasType>).
635 ::cudaTextureObject_t m_tex_obj = 0;
// The memory space instance stored with this record.
636 const Kokkos::CudaUVMSpace m_space;
639 ~SharedAllocationRecord();
640 SharedAllocationRecord() = default;
// Constructor template taking an arbitrary execution space (the parameter is
// unnamed). Most of the initializer list is missing from this listing; the
// visible parts show the base being given the debug-only root record, the
// result of checked_allocation_with_header(), and a total size of
// sizeof(SharedAllocationHeader) + arg_alloc_size.
646 template <typename ExecutionSpace>
647 SharedAllocationRecord(
648 const ExecutionSpace& ,
649 const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label,
650 const size_t arg_alloc_size,
651 const RecordBase::function_type arg_dealloc = &base_t::deallocate)
653#ifdef KOKKOS_ENABLE_DEBUG
654 &SharedAllocationRecord<Kokkos::CudaUVMSpace, void>::s_root_record,
656 Impl::checked_allocation_with_header(arg_space, arg_label,
658 sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
// The header is filled in place through m_alloc_ptr (no staging copy).
// NOTE(review): the remaining arguments of this call are not visible.
662 this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
// Constructor without an execution space instance (declaration only).
// arg_dealloc defaults to base_t::deallocate.
666 SharedAllocationRecord(
667 const Kokkos::CudaUVMSpace& arg_space,
const std::string& arg_label,
668 const size_t arg_alloc_size,
669 const RecordBase::function_type arg_dealloc = &base_t::deallocate);
// Returns the record's texture object for alias type AliasType, which the
// static_assert restricts to int, ::int2 or ::int4. The object is created on
// first use: only when m_tex_obj is still 0 is the CudaSpace record's static
// attach_texture_object called with sizeof(AliasType), the allocation
// pointer and the allocation size.
// NOTE(review): the return statement and closing braces are not visible in
// this listing.
672 template <
typename AliasType>
673 inline ::cudaTextureObject_t attach_texture_object() {
674 static_assert((std::is_same<AliasType, int>::value ||
675 std::is_same<AliasType, ::int2>::value ||
676 std::is_same<AliasType, ::int4>::value),
677 "Cuda texture fetch only supported for alias types of int, "
678 "::int2, or ::int4");
680 if (m_tex_obj == 0) {
681 m_tex_obj = SharedAllocationRecord<Kokkos::CudaSpace, void>::
682 attach_texture_object(
sizeof(AliasType),
683 (
void*)RecordBase::m_alloc_ptr,
684 RecordBase::m_alloc_size);
// Returns the distance, in AliasType elements, from the start of the record's
// allocation (m_alloc_ptr reinterpreted as AliasType*) to ptr. The pointer
// difference is converted to the int return type.
690 template <
typename AliasType>
691 inline int attach_texture_object_offset(
const AliasType*
const ptr) {
693 return ptr -
reinterpret_cast<AliasType*
>(RecordBase::m_alloc_ptr);
// Allocation record specialization for Kokkos::CudaHostPinnedSpace. It
// derives from SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>.
// Unlike the CudaSpace and CudaUVMSpace records, no texture object member is
// declared here. Access specifiers, `template <>` and several other lines
// are missing from this listing.
698class SharedAllocationRecord<Kokkos::CudaHostPinnedSpace, void>
699 :
public SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace> {
701 friend class SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
703 using RecordBase = SharedAllocationRecord<void, void>;
704 using base_t = SharedAllocationRecordCommon<Kokkos::CudaHostPinnedSpace>;
// Records are non-copyable.
706 SharedAllocationRecord(const SharedAllocationRecord&) = delete;
707 SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete;
// Root record. NOTE(review): no preprocessor guard is visible around this
// declaration here -- it may have been dropped from the listing.
709 static RecordBase s_root_record;
// The memory space instance stored with this record.
711 const Kokkos::CudaHostPinnedSpace m_space;
714 ~SharedAllocationRecord();
715 SharedAllocationRecord() = default;
// Constructor template taking an arbitrary execution space (the parameter is
// unnamed). Most of the initializer list is missing from this listing; the
// visible parts show the base being given the debug-only root record, the
// result of checked_allocation_with_header(), and a total size of
// sizeof(SharedAllocationHeader) + arg_alloc_size.
721 template <typename ExecutionSpace>
722 SharedAllocationRecord(
723 const ExecutionSpace& ,
724 const Kokkos::CudaHostPinnedSpace& arg_space,
725 const std::string& arg_label, const size_t arg_alloc_size,
726 const RecordBase::function_type arg_dealloc = &base_t::deallocate)
728#ifdef KOKKOS_ENABLE_DEBUG
729 &SharedAllocationRecord<Kokkos::CudaHostPinnedSpace,
730 void>::s_root_record,
732 Impl::checked_allocation_with_header(arg_space, arg_label,
734 sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc,
// The header is filled in place through m_alloc_ptr (no staging copy).
// NOTE(review): the remaining arguments of this call are not visible.
737 this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr,
// Constructor without an execution space instance (declaration only).
// arg_dealloc defaults to base_t::deallocate.
741 SharedAllocationRecord(
742 const Kokkos::CudaHostPinnedSpace& arg_space,
743 const std::string& arg_label,
const size_t arg_alloc_size,
744 const RecordBase::function_type arg_dealloc = &base_t::deallocate);
LogicalMemorySpace is a space that is identical to another space, but differentiable by name and temp...
Memory management for host memory.
bool available()
Query if hwloc is available.
Access relationship between DstMemorySpace and SrcMemorySpace.