10 template <
typename DsDataType,
15 typename CDEElementwiseOperation,
16 typename BlockwiseGemmPipe>
27 __device__
static constexpr
bool IsLDSNeeded() {
return false; }
31 typename DsGridPointer,
32 typename DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
33 typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>
34 __device__
static void Run(CThreadBuf& c_thread_buf,
38 const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&,
39 const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock&
40 e_grid_desc_mblock_mperblock_nblock_nperblock,
41 CDEElementwiseOperation& cde_element_op,
45 auto e_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
46 p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
49 constexpr
auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs =
51 GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
54 constexpr
auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp =
56 GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
58 constexpr
auto MWave =
59 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
61 constexpr
auto MSubGroup =
62 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
64 constexpr
auto NWave =
65 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
67 constexpr
auto NThreadPerSubGroup =
68 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
70 constexpr
auto MAccVgprs =
71 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp
75 const auto c_thread_mtx_on_block =
76 BlockwiseGemmPipe::CalculateCThreadOriginDataIndex(
I0,
I0);
78 const auto m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor =
84 const auto m_thread_data_on_grid_idx =
85 m_thread_data_on_grid_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex(
88 const auto n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor =
94 const auto n_thread_data_on_grid_idx =
95 n_thread_data_on_grid_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex(
99 const auto c_grid_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs =
101 e_grid_desc_mblock_mperblock_nblock_nperblock,
117 decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
118 decltype(c_grid_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
119 CDEElementwiseOperation,
124 EGlobalMemoryDataOperation,
126 false>{c_grid_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
128 m_thread_data_on_grid_idx[
I1],
129 m_thread_data_on_grid_idx[
I2],
130 n_thread_data_on_grid_idx[
I0],
131 n_thread_data_on_grid_idx[
I1],
132 n_thread_data_on_grid_idx[
I2],
133 m_thread_data_on_grid_idx[
I3]),
137 c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
140 c_grid_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
__host__ constexpr __device__ auto make_multi_index(Xs &&... xs)
Definition: array_multi_index.hpp:15
InMemoryDataOperationEnum
Definition: ck.hpp:279
__host__ constexpr __device__ auto make_merge_transform(const LowLengths &low_lengths)
Definition: multi_index_transform_helper.hpp:55
__host__ constexpr __device__ auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition: tensor_adaptor.hpp:425
__host__ constexpr __device__ auto make_freeze_transform(const LowerIndex &low_idx)
Definition: multi_index_transform_helper.hpp:151
__host__ constexpr __device__ auto make_tuple(Xs &&... xs)
Definition: tuple.hpp:212
__host__ constexpr __device__ auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition: multi_index_transform_helper.hpp:90
int32_t index_t
Definition: ck.hpp:301
__host__ constexpr __device__ auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition: tensor_descriptor.hpp:319
Definition: epilogue_direct_store.hpp:18
static constexpr auto I4
Definition: epilogue_direct_store.hpp:23
static constexpr auto I5
Definition: epilogue_direct_store.hpp:24
static constexpr auto I1
Definition: epilogue_direct_store.hpp:20
static constexpr __device__ bool IsLDSNeeded()
Definition: epilogue_direct_store.hpp:27
static constexpr auto I0
Definition: epilogue_direct_store.hpp:19
static constexpr auto I6
Definition: epilogue_direct_store.hpp:25
static constexpr auto I2
Definition: epilogue_direct_store.hpp:21
static constexpr auto I3
Definition: epilogue_direct_store.hpp:22
static __device__ void Run(CThreadBuf &c_thread_buf, DsGridPointer, EDataType *p_e_grid, void *, const DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock &e_grid_desc_mblock_mperblock_nblock_nperblock, CDEElementwiseOperation &cde_element_op, const index_t &block_m_id, const index_t &block_n_id)
Definition: epilogue_direct_store.hpp:34
Definition: sequence.hpp:43
Definition: threadwise_tensor_slice_transfer.hpp:39
Definition: integral_constant.hpp:20