47 std::cerr <<
"WARNING: Device result is all zeros - kernel may not have executed "
53 std::cerr <<
", " << std::setprecision(2) << std::fixed <<
error_percentage()
54 <<
"% wrong values" << std::endl;
61 template <
typename ComputeDataType,
typename OutDataType,
typename AccDataType = ComputeDataType>
72 if constexpr(std::is_same_v<ComputeDataType, I8> || std::is_same_v<ComputeDataType, I16> ||
73 std::is_same_v<ComputeDataType, I32> || std::is_same_v<ComputeDataType, int>)
78 else if constexpr((std::is_same_v<ComputeDataType, F16> ||
79 std::is_same_v<ComputeDataType, BF16> ||
80 std::is_same_v<ComputeDataType, F32>) &&
81 (std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
82 std::is_same_v<OutDataType, F32>) &&
83 (std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
84 std::is_same_v<AccDataType, F32>))
86 return static_cast<float>(
87 ck::utils::get_relative_threshold<ComputeDataType, OutDataType, AccDataType>(
88 number_of_accumulations));
93 if constexpr(std::is_same_v<OutDataType, F16>)
97 else if constexpr(std::is_same_v<OutDataType, BF16>)
126 template <
typename T>
128 const T* __restrict__ reference_result,
134 constexpr
int block_size = 256;
137 __shared__
unsigned long long shared_error_count[block_size];
138 __shared__
float shared_max_error[block_size];
139 __shared__
int shared_has_error[block_size];
140 __shared__
int shared_has_nonzero[block_size];
143 unsigned long long local_error_count = 0;
144 float local_max_error = 0.0f;
145 int local_has_error = 0;
146 int local_has_nonzero = 0;
149 long long idx = blockIdx.x * blockDim.x + threadIdx.x;
150 long long stride = blockDim.x * gridDim.x;
152 for(
long long i = idx; i < size; i += stride)
155 float dev_val = type_convert<float>(device_result[i]);
156 float ref_val = type_convert<float>(reference_result[i]);
161 local_has_nonzero = 1;
165 float abs_diff = fabsf(dev_val - ref_val);
168 if(abs_diff > atol + rtol * fabsf(ref_val))
172 local_max_error = fmaxf(local_max_error, abs_diff);
177 shared_error_count[threadIdx.x] = local_error_count;
178 shared_max_error[threadIdx.x] = local_max_error;
179 shared_has_error[threadIdx.x] = local_has_error;
180 shared_has_nonzero[threadIdx.x] = local_has_nonzero;
184 for(
unsigned int s = block_size / 2; s >= 32; s >>= 1)
188 shared_error_count[threadIdx.x] += shared_error_count[threadIdx.x + s];
189 shared_max_error[threadIdx.x] =
190 fmaxf(shared_max_error[threadIdx.x], shared_max_error[threadIdx.x + s]);
191 shared_has_error[threadIdx.x] |= shared_has_error[threadIdx.x + s];
192 shared_has_nonzero[threadIdx.x] |= shared_has_nonzero[threadIdx.x + s];
200 for(
int i = 1; i < 32; ++i)
202 shared_error_count[0] += shared_error_count[i];
203 shared_max_error[0] = fmaxf(shared_max_error[0], shared_max_error[i]);
204 shared_has_error[0] |= shared_has_error[i];
205 shared_has_nonzero[0] |= shared_has_nonzero[i];
209 if(shared_has_error[0])
211 atomicAdd(&result->
error_count, shared_error_count[0]);
212 atomicMax(&result->
max_error, shared_max_error[0]);
215 if(!shared_has_nonzero[0])
228 template <
typename T>
230 const void* reference_result,
234 hipStream_t stream =
nullptr)
251 constexpr
int block_size = 256;
252 int grid_size = std::min<int>(65535, (size + block_size - 1) / block_size);
255 <<<grid_size, block_size, 0, stream>>>(
static_cast<const T*
>(device_result),
256 static_cast<const T*
>(reference_result),
259 static_cast<long long>(size),
285 template <
typename T>
286 float gpu_reduce_max(
const void* device_buffer, std::size_t size, hipStream_t stream =
nullptr);
291 template <
typename OutDataType,
292 typename ComputeDataType = OutDataType,
293 typename AccDataType = ComputeDataType>
295 const void* reference_result,
296 int number_of_accumulations,
298 hipStream_t stream =
nullptr)
301 double max_abs_value =
302 static_cast<double>(gpu_reduce_max<OutDataType>(reference_result, size, stream));
305 float rtol = compute_relative_tolerance<ComputeDataType, OutDataType, AccDataType>(
306 number_of_accumulations);
314 if constexpr((std::is_same_v<ComputeDataType, F16> || std::is_same_v<ComputeDataType, BF16> ||
315 std::is_same_v<ComputeDataType, F32>) &&
316 (std::is_same_v<OutDataType, F16> || std::is_same_v<OutDataType, BF16> ||
317 std::is_same_v<OutDataType, F32>) &&
318 (std::is_same_v<AccDataType, F16> || std::is_same_v<AccDataType, BF16> ||
319 std::is_same_v<AccDataType, F32>))
321 atol =
static_cast<float>(
322 ck::utils::get_absolute_threshold<ComputeDataType, OutDataType, AccDataType>(
323 max_abs_value, number_of_accumulations));
327 return gpu_verify<OutDataType>(device_result, reference_result, rtol, atol, size, stream);
334 template <
typename T>
338 constexpr
int block_size = 256;
339 __shared__
float shared_max[block_size];
341 long long idx = blockIdx.x * blockDim.x + threadIdx.x;
342 long long stride = blockDim.x * gridDim.x;
344 float local_max = 0.0f;
346 for(
long long i = idx; i < size; i += stride)
348 float val = fabsf(type_convert<float>(data[i]));
349 local_max = fmaxf(local_max, val);
352 shared_max[threadIdx.x] = local_max;
356 for(
unsigned int s = block_size / 2; s >= 32; s >>= 1)
360 shared_max[threadIdx.x] = fmaxf(shared_max[threadIdx.x], shared_max[threadIdx.x + s]);
368 for(
int i = 1; i < 32; ++i)
370 shared_max[0] = fmaxf(shared_max[0], shared_max[i]);
374 atomicMax(max_val, shared_max[0]);
381 template <
typename T>
382 float gpu_reduce_max(
const void* device_buffer, std::size_t size, hipStream_t stream)
394 float init_val = 0.0f;
395 hip_check_error(hipMemcpy(max_dev, &init_val,
sizeof(
float), hipMemcpyHostToDevice));
400 constexpr
int block_size = 256;
401 int grid_size = std::min<int>(1024, (size + block_size - 1) / block_size);
403 gpu_reduce_max_kernel<T><<<grid_size, block_size, 0, stream>>>(
404 static_cast<const T*
>(device_buffer),
static_cast<long long>(size), max_dev);
409 if(stream ==
nullptr)
416 hip_check_error(hipMemcpy(&max_host, max_dev,
sizeof(
float), hipMemcpyDeviceToHost));
__global__ void gpu_verify_kernel(const T *__restrict__ device_result, const T *__restrict__ reference_result, float rtol, float atol, long long size, GpuVerifyDeviceResult *result)
Definition: gpu_verification.hpp:127
float gpu_reduce_max(const void *device_buffer, std::size_t size, hipStream_t stream=nullptr)
Definition: gpu_verification.hpp:382
float compute_relative_tolerance(const int number_of_accumulations=1)
Definition: gpu_verification.hpp:62
__global__ void gpu_reduce_max_kernel(const T *__restrict__ data, long long size, float *__restrict__ max_val)
Definition: gpu_verification.hpp:336
GpuVerifyResult gpu_verify(const void *device_result, const void *reference_result, float rtol, float atol, std::size_t size, hipStream_t stream=nullptr)
Definition: gpu_verification.hpp:229
float F32
32-bit floating point (single precision) type
Definition: check_err.hpp:33
ck_tile::bf16_t BF16
16-bit brain floating point type
Definition: check_err.hpp:31
ck_tile::half_t F16
16-bit floating point (half precision) type
Definition: check_err.hpp:29
int32_t I32
32-bit signed integer type
Definition: check_err.hpp:37
int8_t I8
8-bit signed integer type
Definition: check_err.hpp:35
_Float16 half_t
Definition: data_type.hpp:31
ushort bhalf_t
Definition: data_type.hpp:30
void hip_check_error(hipError_t x)
Definition: hip_check_error.hpp:12
signed short int16_t
Definition: stdint.h:122
signed int int32_t
Definition: stdint.h:123
signed char int8_t
Definition: stdint.h:121
Definition: gpu_verification.hpp:112
int all_zero
Definition: gpu_verification.hpp:115
unsigned long long error_count
Definition: gpu_verification.hpp:113
float max_error
Definition: gpu_verification.hpp:114
Definition: gpu_verification.hpp:22
std::size_t total
Definition: gpu_verification.hpp:25
float max_error
Definition: gpu_verification.hpp:24
bool all_zero
Definition: gpu_verification.hpp:26
unsigned long long error_count
Definition: gpu_verification.hpp:23
void print_error_summary() const
Definition: gpu_verification.hpp:41
float error_percentage() const
Definition: gpu_verification.hpp:33