/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp Source File

/home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp Source File#

Composable Kernel: /home/docs/checkouts/readthedocs.org/user_builds/advanced-micro-devices-composable-kernel/checkouts/3643/include/ck/tensor_operation/gpu/device/device_grouped_gemm_tile_loop.hpp Source File
device_grouped_gemm_tile_loop.hpp
Go to the documentation of this file.
1 // Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
2 // SPDX-License-Identifier: MIT
3 
4 #pragma once
5 
6 #include "ck/ck.hpp"
10 
11 #include "device_grouped_gemm.hpp"
12 
13 namespace ck {
14 namespace tensor_operation {
15 namespace device {
16 
25 
26 template <typename ALayout,
27  typename BLayout,
28  typename DsLayout,
29  typename ELayout,
30  typename ADataType,
31  typename BDataType,
32  typename DsDataType,
33  typename EDataType,
34  typename AElementwiseOperation,
35  typename BElementwiseOperation,
36  typename CDEElementwiseOperation>
38  BLayout,
39  DsLayout,
40  ELayout,
41  ADataType,
42  BDataType,
43  DsDataType,
44  EDataType,
45  AElementwiseOperation,
46  BElementwiseOperation,
47  CDEElementwiseOperation>
48 {
49 };
50 
51 template <ck::index_t BlockSize>
53 {
54  // The oversubscription factor for the number of blocks that can simultaneously reside on
55  // GPU.
56  static constexpr int BLOCK_SUBSCRIPTION_FACTOR = 1;
57  // static constexpr int BLOCK_WAVES = BlockSize / get_warp_size();
58  static constexpr int CU_SIMDS = 4;
59  // Assume we want to have at most 2 waves per SIMD
60  // static constexpr int CU_BLOCKS = math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
61  static int GetCuBlocks()
62  {
63  int BLOCK_WAVES = BlockSize / get_warp_size();
64  return ck::math::integer_divide_floor(2 * CU_SIMDS, BLOCK_WAVES);
65  }
66 
67  template <typename KernelFunction>
68  static int CalculateMaxOccupancyGridSize(const KernelFunction& kernel,
69  const StreamConfig& stream_config)
70  {
71  // Calculate max number of workgroups that can simultaneously reside on the CU.
72  int occ_num_blocks = GetKernelOccupancy(kernel);
73  int cu_count = getAvailableComputeUnitCount(stream_config);
74 
75  if(stream_config.log_level_ > 0)
76  {
77  std::cout << "MaxActiveBlocksPerCU: " << occ_num_blocks
78  << ", available CUs count: " << cu_count << ", occup. grid size: "
79  << ck::math::min(occ_num_blocks, GetCuBlocks()) * cu_count << std::endl;
80  }
81 
82  return cu_count * ck::math::min(occ_num_blocks, GetCuBlocks());
83  }
84 
85  template <typename KernelFunction>
86  static int GetKernelOccupancy(const KernelFunction& kernel)
87  {
88  int occupancy = 0;
90  hipOccupancyMaxActiveBlocksPerMultiprocessor(&occupancy, kernel, BlockSize, 0));
91  return occupancy;
92  }
93 
94  static int GetComputeUnitCount()
95  {
96  hipDeviceProp_t dev_prop;
97  hipDevice_t dev;
98  ck::hip_check_error(hipGetDevice(&dev));
99  ck::hip_check_error(hipGetDeviceProperties(&dev_prop, dev));
100  return dev_prop.multiProcessorCount;
101  }
102 };
103 
104 } // namespace device
105 } // namespace tensor_operation
106 } // namespace ck
__host__ constexpr __device__ auto integer_divide_floor(X x, Y y)
Definition: math.hpp:66
__host__ constexpr __device__ T min(T x)
Definition: math.hpp:116
Definition: ck.hpp:270
constexpr __device__ index_t get_warp_size()
Definition: get_id.hpp:10
void hip_check_error(hipError_t x)
Definition: hip_check_error.hpp:12
Definition: stream_config.hpp:9
int log_level_
Definition: stream_config.hpp:12
Definition: device_grouped_gemm.hpp:100
Grouped GEMM kernel using output Tile Looping algorithm.
Definition: device_grouped_gemm_tile_loop.hpp:48
Definition: device_grouped_gemm_tile_loop.hpp:53
static constexpr int BLOCK_SUBSCRIPTION_FACTOR
Definition: device_grouped_gemm_tile_loop.hpp:56
static constexpr int CU_SIMDS
Definition: device_grouped_gemm_tile_loop.hpp:58
static int CalculateMaxOccupancyGridSize(const KernelFunction &kernel, const StreamConfig &stream_config)
Definition: device_grouped_gemm_tile_loop.hpp:68
static int GetComputeUnitCount()
Definition: device_grouped_gemm_tile_loop.hpp:94
static int GetCuBlocks()
Definition: device_grouped_gemm_tile_loop.hpp:61
static int GetKernelOccupancy(const KernelFunction &kernel)
Definition: device_grouped_gemm_tile_loop.hpp:86